diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..e2f3c437 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-07-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi- view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2307.12976v1","updated":"2023-07-24T17:52:46Z","published":"2023-07-24T17:52:46Z","title":"Evaluating the Ripple Effects of Knowledge Editing in Language Models","summary":" Modern language models capture a large body of factual knowledge. However,\nsome facts can be incorrectly induced or become obsolete over time, resulting\nin factually incorrect generations. This has led to the development of various\nediting methods that allow updating facts encoded by the model. Evaluation of\nthese methods has primarily focused on testing whether an individual fact has\nbeen successfully injected, and if similar predictions for other subjects have\nnot changed. Here we argue that such evaluation is limited, since injecting one\nfact (e.g. ``Jack Depp is the son of Johnny Depp'') introduces a ``ripple\neffect'' in the form of additional facts that the model needs to update\n(e.g.``Jack Depp is the sibling of Lily-Rose Depp''). To address this issue, we\npropose a novel set of evaluation criteria that consider the implications of an\nedit on related facts. Using these criteria, we then construct \\ripple{}, a\ndiagnostic benchmark of 5K factual edits, capturing a variety of types of\nripple effects. 
We evaluate prominent editing methods on \\ripple{}, showing\nthat current methods fail to introduce consistent changes in the model's\nknowledge. In addition, we find that a simple in-context editing baseline\nobtains the best scores on our benchmark, suggesting a promising research\ndirection for model editing.\n","authors":["Roi Cohen","Eden Biran","Ori Yoran","Amir Globerson","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2307.12976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12973v1","updated":"2023-07-24T17:49:31Z","published":"2023-07-24T17:49:31Z","title":"Leveraging Label Variation in Large Language Models for Zero-Shot Text\n Classification","summary":" The zero-shot learning capabilities of large language models (LLMs) make them\nideal for text classification without annotation or supervised training. Many\nstudies have shown impressive results across multiple tasks. While tasks, data,\nand results differ widely, their similarities to human annotation can aid us in\ntackling new tasks with minimal expenses. We evaluate using 5 state-of-the-art\nLLMs as \"annotators\" on 5 different tasks (age, gender, topic, sentiment\nprediction, and hate speech detection), across 4 languages: English, French,\nGerman, and Spanish. No single model excels at all tasks, across languages, or\nacross all labels within a task. However, aggregation techniques designed for\nhuman annotators perform substantially better than any one individual model.\nOverall, though, LLMs do not rival even simple supervised models, so they do\nnot (yet) replace the need for human annotation. We also discuss the tradeoffs\nbetween speed, accuracy, cost, and bias when it comes to aggregated model\nlabeling versus human annotation.\n","authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2307.12973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12966v1","updated":"2023-07-24T17:44:58Z","published":"2023-07-24T17:44:58Z","title":"Aligning Large Language Models with Human: A Survey","summary":" Large Language Models (LLMs) trained on extensive textual corpora have\nemerged as leading solutions for a broad array of Natural Language Processing\n(NLP) tasks. Despite their notable performance, these models are prone to\ncertain limitations such as misunderstanding human instructions, generating\npotentially biased content, or factually incorrect (hallucinated) information.\nHence, aligning LLMs with human expectations has become an active area of\ninterest within the research community. This survey presents a comprehensive\noverview of these alignment technologies, including the following aspects. (1)\nData collection: the methods for effectively collecting high-quality\ninstructions for LLM alignment, including the use of NLP benchmarks, human\nannotations, and leveraging strong LLMs. (2) Training methodologies: a detailed\nreview of the prevailing training methods employed for LLM alignment. Our\nexploration encompasses Supervised Fine-tuning, both Online and Offline human\npreference training, along with parameter-efficient training mechanisms. (3)\nModel Evaluation: the methods for evaluating the effectiveness of these\nhuman-aligned LLMs, presenting a multifaceted approach towards their\nassessment. In conclusion, we collate and distill our findings, shedding light\non several promising future research avenues in the field. 
This survey,\ntherefore, serves as a valuable resource for anyone invested in understanding\nand advancing the alignment of LLMs to better suit human-oriented tasks and\nexpectations. An associated GitHub link collecting the latest papers is\navailable at https://github.com/GaryYufei/AlignLLMHumanSurvey.\n","authors":["Yufei Wang","Wanjun Zhong","Liangyou Li","Fei Mi","Xingshan Zeng","Wenyong Huang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12966v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2303.04245v2","updated":"2023-07-24T17:29:04Z","published":"2023-03-07T21:42:17Z","title":"How Do Transformers Learn Topic Structure: Towards a Mechanistic\n Understanding","summary":" While the successes of transformers across many domains are indisputable,\naccurate understanding of the learning mechanics is still largely lacking.\nTheir capabilities have been probed on benchmarks which include a variety of\nstructured and reasoning tasks -- but mathematical understanding is lagging\nsubstantially behind. Recent lines of work have begun studying representational\naspects of this question: that is, the size/depth/complexity of attention-based\nnetworks to perform certain tasks. However, there is no guarantee the learning\ndynamics will converge to the constructions proposed. In our paper, we provide\nfine-grained mechanistic understanding of how transformers learn \"semantic\nstructure\", understood as capturing co-occurrence structure of words.\nPrecisely, we show, through a combination of mathematical analysis and\nexperiments on Wikipedia data and synthetic data modeled by Latent Dirichlet\nAllocation (LDA), that the embedding layer and the self-attention layer encode\nthe topical structure. In the former case, this manifests as higher average\ninner product of embeddings between same-topic words. In the latter, it\nmanifests as higher average pairwise attention between same-topic words. The\nmathematical results involve several assumptions to make the analysis\ntractable, which we verify on data, and might be of independent interest as\nwell.\n","authors":["Yuchen Li","Yuanzhi Li","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2303.04245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12950v1","updated":"2023-07-24T17:23:22Z","published":"2023-07-24T17:23:22Z","title":"RLCD: Reinforcement Learning from Contrast Distillation for Language\n Model Alignment","summary":" We propose Reinforcement Learning from Contrast Distillation (RLCD), a method\nfor aligning language models to follow natural language principles without\nusing human feedback. RLCD trains a preference model using simulated preference\npairs that contain both a high-quality and low-quality example, generated using\ncontrasting positive and negative prompts. 
The preference model is then used to\nimprove a base unaligned language model via reinforcement learning.\nEmpirically, RLCD outperforms RLAIF (Bai et al., 2022b) and context\ndistillation (Huang et al., 2022) baselines across three diverse alignment\ntasks--harmlessness, helpfulness, and story outline generation--and on both 7B\nand 30B model scales for preference data simulation.\n","authors":["Kevin Yang","Dan Klein","Asli Celikyilmaz","Nanyun Peng","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2307.12950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12949v1","updated":"2023-07-24T17:22:04Z","published":"2023-07-24T17:22:04Z","title":"Boosting Punctuation Restoration with Data Generation and Reinforcement\n Learning","summary":" Punctuation restoration is an important task in automatic speech recognition\n(ASR) which aim to restore the syntactic structure of generated ASR texts to\nimprove readability. While punctuated texts are abundant from written\ndocuments, the discrepancy between written punctuated texts and ASR texts\nlimits the usability of written texts in training punctuation restoration\nsystems for ASR texts. This paper proposes a reinforcement learning method to\nexploit in-topic written texts and recent advances in large pre-trained\ngenerative language models to bridge this gap. The experiments show that our\nmethod achieves state-of-the-art performance on the ASR test set on two\nbenchmark datasets for punctuation restoration.\n","authors":["Viet Dac Lai","Abel Salinas","Hao Tan","Trung Bui","Quan Tran","Seunghyun Yoon","Hanieh Deilamsalehy","Franck Dernoncourt","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.12949v1.pdf","comment":"Accepted at INTERSPEECH 2023, 6 pages"},{"id":"http://arxiv.org/abs/2307.12935v1","updated":"2023-07-24T16:55:37Z","published":"2023-07-24T16:55:37Z","title":"Rule By Example: Harnessing Logical Rules for Explainable Hate Speech\n Detection","summary":" Classic approaches to content moderation typically apply a rule-based\nheuristic approach to flag content. While rules are easily customizable and\nintuitive for humans to interpret, they are inherently fragile and lack the\nflexibility or robustness needed to moderate the vast amount of undesirable\ncontent found online today. Recent advances in deep learning have demonstrated\nthe promise of using highly effective deep neural models to overcome these\nchallenges. However, despite the improved performance, these data-driven models\nlack transparency and explainability, often leading to mistrust from everyday\nusers and a lack of adoption by many platforms. In this paper, we present Rule\nBy Example (RBE): a novel exemplar-based contrastive learning approach for\nlearning from logical rules for the task of textual content moderation. RBE is\ncapable of providing rule-grounded predictions, allowing for more explainable\nand customizable predictions compared to typical deep learning-based\napproaches. We demonstrate that our approach is capable of learning rich rule\nembedding representations using only a few data examples. 
Experimental results\non 3 popular hate speech classification datasets show that RBE is able to\noutperform state-of-the-art deep learning classifiers as well as the use of\nrules in both supervised and unsupervised settings while providing explainable\nmodel predictions via rule-grounding.\n","authors":["Christopher Clarke","Matthew Hall","Gaurav Mittal","Ye Yu","Sandra Sajeev","Jason Mars","Mei Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12935v1.pdf","comment":"ACL 2023 Main Conference"},{"id":"http://arxiv.org/abs/2307.12896v1","updated":"2023-07-24T15:44:23Z","published":"2023-07-24T15:44:23Z","title":"Corrections of Zipf's and Heaps' Laws Derived from Hapax Rate Models","summary":" The article introduces corrections to Zipf's and Heaps' laws based on\nsystematic models of the hapax rate. The derivation rests on two assumptions:\nThe first one is the standard urn model which predicts that marginal frequency\ndistributions for shorter texts look as if word tokens were sampled blindly\nfrom a given longer text. The second assumption posits that the rate of hapaxes\nis a simple function of the text size. Four such functions are discussed: the\nconstant model, the Davis model, the linear model, and the logistic model. It\nis shown that the logistic model yields the best fit.\n","authors":["Łukasz Dębowski"],"pdf_url":"https://arxiv.org/pdf/2307.12896v1.pdf","comment":"41 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2304.08649v3","updated":"2023-07-24T15:33:25Z","published":"2023-04-17T22:53:54Z","title":"Classification of US Supreme Court Cases using BERT-Based Techniques","summary":" Models based on bidirectional encoder representations from transformers\n(BERT) produce state of the art (SOTA) results on many natural language\nprocessing (NLP) tasks such as named entity recognition (NER), part-of-speech\n(POS) tagging etc. An interesting phenomenon occurs when classifying long\ndocuments such as those from the US supreme court where BERT-based models can\nbe considered difficult to use on a first-pass or out-of-the-box basis. In this\npaper, we experiment with several BERT-based classification techniques for US\nsupreme court decisions or supreme court database (SCDB) and compare them with\nthe previous SOTA results. We then compare our results specifically with SOTA\nmodels for long documents. We compare our results for two classification tasks:\n(1) a broad classification task with 15 categories and (2) a fine-grained\nclassification task with 279 categories. Our best result produces an accuracy\nof 80\\% on the 15 broad categories and 60\\% on the fine-grained 279 categories\nwhich marks an improvement of 8\\% and 28\\% respectively from previously\nreported SOTA results.\n","authors":["Shubham Vatsal","Adam Meyers","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2304.08649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10490v3","updated":"2023-07-24T15:24:17Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. 
When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12856v1","updated":"2023-07-24T14:56:30Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web navigation. However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that can complete the tasks on real\nwebsites following natural language instructions. WebAgent plans ahead by\ndecomposing instructions into canonical sub-instructions, summarizes long HTML\ndocuments into task-relevant snippets, and acts on websites via generated\nPython programs from those. We design WebAgent with Flan-U-PaLM, for grounded\ncode generation, and HTML-T5, new pre-trained LLMs for long HTML documents\nusing local and global attention mechanisms and a mixture of long-span\ndenoising objectives, for planning and summarization. We empirically\ndemonstrate that our recipe improves the success on a real website by over 50%,\nand that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9%\nhigher success rate than prior SoTA on the MiniWoB web navigation benchmark and\nbetter accuracy on offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12835v1","updated":"2023-07-24T14:33:49Z","published":"2023-07-24T14:33:49Z","title":"Joint Dropout: Improving Generalizability in Low-Resource Neural Machine\n Translation through Phrase Pair Variables","summary":" Despite the tremendous success of Neural Machine Translation (NMT), its\nperformance on low-resource language pairs still remains subpar, partly due to\nthe limited ability to handle previously unseen inputs, i.e., generalization.\nIn this paper, we propose a method called Joint Dropout, that addresses the\nchallenge of low-resource neural machine translation by substituting phrases\nwith variables, resulting in significant enhancement of compositionality, which\nis a key aspect of generalization. We observe a substantial improvement in\ntranslation quality for language pairs with minimal resources, as seen in BLEU\nand Direct Assessment scores. 
Furthermore, we conduct an error analysis, and\nfind Joint Dropout to also enhance generalizability of low-resource NMT in\nterms of robustness and adaptability across different domains\n","authors":["Ali Araabi","Vlad Niculae","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2307.12835v1.pdf","comment":"Accepted at MT Summit 2023"},{"id":"http://arxiv.org/abs/2307.12803v1","updated":"2023-07-24T13:54:37Z","published":"2023-07-24T13:54:37Z","title":"Guidance in Radiology Report Summarization: An Empirical Evaluation and\n Error Analysis","summary":" Automatically summarizing radiology reports into a concise impression can\nreduce the manual burden of clinicians and improve the consistency of\nreporting. Previous work aimed to enhance content selection and factuality\nthrough guided abstractive summarization. However, two key issues persist.\nFirst, current methods heavily rely on domain-specific resources to extract the\nguidance signal, limiting their transferability to domains and languages where\nthose resources are unavailable. Second, while automatic metrics like ROUGE\nshow progress, we lack a good understanding of the errors and failure modes in\nthis task. To bridge these gaps, we first propose a domain-agnostic guidance\nsignal in form of variable-length extractive summaries. Our empirical results\non two English benchmarks demonstrate that this guidance signal improves upon\nunguided summarization while being competitive with domain-specific methods.\nAdditionally, we run an expert evaluation of four systems according to a\ntaxonomy of 11 fine-grained errors. We find that the most pressing differences\nbetween automatic summaries and those of radiologists relate to content\nselection including omissions (up to 52%) and additions (up to 57%). We\nhypothesize that latent reporting factors and corpus-level inconsistencies may\nlimit models to reliably learn content selection from the available data,\npresenting promising directions for future work.\n","authors":["Jan Trienes","Paul Youssef","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.12803v1.pdf","comment":"Accepted at INLG2023"},{"id":"http://arxiv.org/abs/2307.12798v1","updated":"2023-07-24T13:51:19Z","published":"2023-07-24T13:51:19Z","title":"RRAML: Reinforced Retrieval Augmented Machine Learning","summary":" The emergence of large language models (LLMs) has revolutionized machine\nlearning and related fields, showcasing remarkable abilities in comprehending,\ngenerating, and manipulating human language. However, their conventional usage\nthrough API-based text prompt submissions imposes certain limitations in terms\nof context constraints and external source availability. To address these\nchallenges, we propose a novel framework called Reinforced Retrieval Augmented\nMachine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs\nwith supporting information retrieved by a purpose-built retriever from a vast\nuser-provided database. By leveraging recent advancements in reinforcement\nlearning, our method effectively addresses several critical challenges.\nFirstly, it circumvents the need for accessing LLM gradients. Secondly, our\nmethod alleviates the burden of retraining LLMs for specific tasks, as it is\noften impractical or impossible due to restricted access to the model and the\ncomputational intensity involved. 
Additionally we seamlessly link the\nretriever's task with the reasoner, mitigating hallucinations and reducing\nirrelevant, and potentially damaging retrieved documents. We believe that the\nresearch agenda outlined in this paper has the potential to profoundly impact\nthe field of AI, democratizing access to and utilization of LLMs for a wide\nrange of entities.\n","authors":["Andrea Bacciu","Florin Cocunasu","Federico Siciliano","Fabrizio Silvestri","Nicola Tonellotto","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2307.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.12662v4","updated":"2023-07-24T13:22:58Z","published":"2020-11-25T11:44:12Z","title":"XTQA: Span-Level Explanations of the Textbook Question Answering","summary":" Textbook Question Answering (TQA) is a task that one should answer a\ndiagram/non-diagram question given a large multi-modal context consisting of\nabundant essays and diagrams. We argue that the explainability of this task\nshould place students as a key aspect to be considered. To address this issue,\nwe devise a novel architecture towards span-level eXplanations of the TQA\n(XTQA) based on our proposed coarse-to-fine grained algorithm, which can\nprovide not only the answers but also the span-level evidences to choose them\nfor students. This algorithm first coarsely chooses top $M$ paragraphs relevant\nto questions using the TF-IDF method, and then chooses top $K$ evidence spans\nfinely from all candidate spans within these paragraphs by computing the\ninformation gain of each span to questions. Experimental results shows that\nXTQA significantly improves the state-of-the-art performance compared with\nbaselines. The source code is available at\nhttps://github.com/keep-smile-001/opentqa\n","authors":["Jie Ma","Qi Chai","Jun Liu","Qingyu Yin","Pinghui Wang","Qinghua Zheng"],"pdf_url":"https://arxiv.org/pdf/2011.12662v4.pdf","comment":"Accepted by IEEE TNNLS"},{"id":"http://arxiv.org/abs/2307.12759v1","updated":"2023-07-24T13:04:21Z","published":"2023-07-24T13:04:21Z","title":"Code-Switched Urdu ASR for Noisy Telephonic Environment using Data\n Centric Approach with Hybrid HMM and CNN-TDNN","summary":" Call Centers have huge amount of audio data which can be used for achieving\nvaluable business insights and transcription of phone calls is manually tedious\ntask. An effective Automated Speech Recognition system can accurately\ntranscribe these calls for easy search through call history for specific\ncontext and content allowing automatic call monitoring, improving QoS through\nkeyword search and sentiment analysis. ASR for Call Center requires more\nrobustness as telephonic environment are generally noisy. Moreover, there are\nmany low-resourced languages that are on verge of extinction which can be\npreserved with help of Automatic Speech Recognition Technology. Urdu is the\n$10^{th}$ most widely spoken language in the world, with 231,295,440 worldwide\nstill remains a resource constrained language in ASR. Regional call-center\nconversations operate in local language, with a mix of English numbers and\ntechnical terms generally causing a \"code-switching\" problem. Hence, this paper\ndescribes an implementation framework of a resource efficient Automatic Speech\nRecognition/ Speech to Text System in a noisy call-center environment using\nChain Hybrid HMM and CNN-TDNN for Code-Switched Urdu Language. Using Hybrid\nHMM-DNN approach allowed us to utilize the advantages of Neural Network with\nless labelled data. 
Adding CNN with TDNN has shown to work better in noisy\nenvironment due to CNN's additional frequency dimension which captures extra\ninformation from noisy speech, thus improving accuracy. We collected data from\nvarious open sources and labelled some of the unlabelled data after analysing\nits general context and content from Urdu language as well as from commonly\nused words from other languages, primarily English and were able to achieve WER\nof 5.2% with noisy as well as clean environment in isolated words or numbers as\nwell as in continuous spontaneous speech.\n","authors":["Muhammad Danyal Khan","Raheem Ali","Arshad Aziz"],"pdf_url":"https://arxiv.org/pdf/2307.12759v1.pdf","comment":"32 pages, 19 figures, 2 tables, preprint"},{"id":"http://arxiv.org/abs/2305.16731v3","updated":"2023-07-24T11:20:10Z","published":"2023-05-26T08:33:28Z","title":"Automatic Emotion Experiencer Recognition","summary":" The most prominent subtask in emotion analysis is emotion classification; to\nassign a category to a textual unit, for instance a social media post. Many\nresearch questions from the social sciences do, however, not only require the\ndetection of the emotion of an author of a post but to understand who is\nascribed an emotion in text. This task is tackled by emotion role labeling\nwhich aims at extracting who is described in text to experience an emotion,\nwhy, and towards whom. This could, however, be considered overly sophisticated\nif the main question to answer is who feels which emotion. A targeted approach\nfor such setup is to classify emotion experiencer mentions (aka \"emoters\")\nregarding the emotion they presumably perceive. This task is similar to named\nentity recognition of person names with the difference that not every mentioned\nentity name is an emoter. While, very recently, data with emoter annotations\nhas been made available, no experiments have yet been performed to detect such\nmentions. With this paper, we provide baseline experiments to understand how\nchallenging the task is. We further evaluate the impact on experiencer-specific\nemotion categorization and appraisal detection in a pipeline, when gold\nmentions are not available. We show that experiencer detection in text is a\nchallenging task, with a precision of .82 and a recall of .56 (F1 =.66). These\nresults motivate future work of jointly modeling emoter spans and\nemotion/appraisal predictions.\n","authors":["Maximilian Wegge","Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2305.16731v3.pdf","comment":"accepted to the CPSS workshop at KONVENS"},{"id":"http://arxiv.org/abs/2307.12659v1","updated":"2023-07-24T10:03:28Z","published":"2023-07-24T10:03:28Z","title":"A Model for Every User and Budget: Label-Free and Personalized\n Mixed-Precision Quantization","summary":" Recent advancement in Automatic Speech Recognition (ASR) has produced large\nAI models, which become impractical for deployment in mobile devices. Model\nquantization is effective to produce compressed general-purpose models, however\nsuch models may only be deployed to a restricted sub-domain of interest. We\nshow that ASR models can be personalized during quantization while relying on\njust a small set of unlabelled samples from the target domain. To this end, we\npropose myQASR, a mixed-precision quantization method that generates tailored\nquantization schemes for diverse users under any memory requirement with no\nfine-tuning. 
myQASR automatically evaluates the quantization sensitivity of\nnetwork layers by analysing the full-precision activation values. We are then\nable to generate a personalised mixed-precision quantization scheme for any\npre-determined memory budget. Results for large-scale ASR models show how\nmyQASR improves performance for specific genders, languages, and speakers.\n","authors":["Edward Fish","Umberto Michieli","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.12659v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2301.09790v3","updated":"2023-07-24T10:03:01Z","published":"2023-01-24T02:44:02Z","title":"The Next Chapter: A Study of Large Language Models in Storytelling","summary":" To enhance the quality of generated stories, recent story generation models\nhave been investigating the utilization of higher-level attributes like plots\nor commonsense knowledge. The application of prompt-based learning with large\nlanguage models (LLMs), exemplified by GPT-3, has exhibited remarkable\nperformance in diverse natural language processing (NLP) tasks. This paper\nconducts a comprehensive investigation, utilizing both automatic and human\nevaluation, to compare the story generation capacity of LLMs with recent models\nacross three datasets with variations in style, register, and length of\nstories. The results demonstrate that LLMs generate stories of significantly\nhigher quality compared to other story generation models. Moreover, they\nexhibit a level of performance that competes with human authors, albeit with\nthe preliminary observation that they tend to replicate real stories in\nsituations involving world knowledge, resembling a form of plagiarism.\n","authors":["Zhuohan Xie","Trevor Cohn","Jey Han Lau"],"pdf_url":"https://arxiv.org/pdf/2301.09790v3.pdf","comment":"Accepted to INLG2023"},{"id":"http://arxiv.org/abs/2304.14721v4","updated":"2023-07-24T09:49:55Z","published":"2023-04-28T09:42:18Z","title":"Towards autonomous system: flexible modular production system enhanced\n with large language model agents","summary":" In this paper, we present a novel framework that combines large language\nmodels (LLMs), digital twins and industrial automation system to enable\nintelligent planning and control of production processes. We retrofit the\nautomation system for a modular production facility and create executable\ncontrol interfaces of fine-granular functionalities and coarse-granular skills.\nLow-level functionalities are executed by automation components, and high-level\nskills are performed by automation modules. Subsequently, a digital twin system\nis developed, registering these interfaces and containing additional\ndescriptive information about the production system. Based on the retrofitted\nautomation system and the created digital twins, LLM-agents are designed to\ninterpret descriptive information in the digital twins and control the physical\nsystem through service interfaces. These LLM-agents serve as intelligent agents\non different levels within an automation system, enabling autonomous planning\nand control of flexible production. Given a task instruction as input, the\nLLM-agents orchestrate a sequence of atomic functionalities and skills to\naccomplish the task. 
We demonstrate how our implemented prototype can handle\nun-predefined tasks, plan a production process, and execute the operations.\nThis research highlights the potential of integrating LLMs into industrial\nautomation systems in the context of smart factory for more agile, flexible,\nand adaptive production processes, while it also underscores the critical\ninsights and limitations for future work. Demos at:\nhttps://github.com/YuchenXia/GPT4IndustrialAutomation\n","authors":["Yuchen Xia","Manthan Shenoy","Nasser Jazdi","Michael Weyrich"],"pdf_url":"https://arxiv.org/pdf/2304.14721v4.pdf","comment":"This is the pre-print draft manuscript. The peer-reviewed version\n will be published exclusively by IEEE after the conference, which is set to\n take place from September 12th to 15th, 2023. We've made several improvements\n to the final version of the paper based on valuable feedback and suggestions\n from other researchers"},{"id":"http://arxiv.org/abs/2307.12639v1","updated":"2023-07-24T09:30:30Z","published":"2023-07-24T09:30:30Z","title":"Fake News Detection Through Graph-based Neural Networks: A Survey","summary":" The popularity of online social networks has enabled rapid dissemination of\ninformation. People now can share and consume information much more rapidly\nthan ever before. However, low-quality and/or accidentally/deliberately fake\ninformation can also spread rapidly. This can lead to considerable and negative\nimpacts on society. Identifying, labelling and debunking online misinformation\nas early as possible has become an increasingly urgent problem. Many methods\nhave been proposed to detect fake news including many deep learning and\ngraph-based approaches. In recent years, graph-based methods have yielded\nstrong results, as they can closely model the social context and propagation\nprocess of online news. In this paper, we present a systematic review of fake\nnews detection studies based on graph-based and deep learning-based techniques.\nWe classify existing graph-based methods into knowledge-driven methods,\npropagation-based methods, and heterogeneous social context-based methods,\ndepending on how a graph structure is constructed to model news related\ninformation flows. We further discuss the challenges and open problems in\ngraph-based fake news detection and identify future research directions.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2307.12639v1.pdf","comment":"18 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2210.04676v2","updated":"2023-07-24T09:00:03Z","published":"2022-10-10T13:26:45Z","title":"Learning \"O\" Helps for Learning More: Handling the Concealed Entity\n Problem for Class-incremental NER","summary":" As the categories of named entities rapidly increase, the deployed NER models\nare required to keep updating toward recognizing more entity types, creating a\ndemand for class-incremental learning for NER. Considering the privacy concerns\nand storage constraints, the standard paradigm for class-incremental NER\nupdates the models with training data only annotated with the new classes, yet\nthe entities from other entity classes are unlabeled, regarded as \"Non-entity\"\n(or \"O\"). In this work, we conduct an empirical study on the \"Unlabeled Entity\nProblem\" and find that it leads to severe confusion between \"O\" and entities,\ndecreasing class discrimination of old classes and declining the model's\nability to learn new classes. 
To solve the Unlabeled Entity Problem, we propose\na novel representation learning method to learn discriminative representations\nfor the entity classes and \"O\". Specifically, we propose an entity-aware\ncontrastive learning method that adaptively detects entity clusters in \"O\".\nFurthermore, we propose two effective distance-based relabeling strategies for\nbetter learning the old classes. We introduce a more realistic and challenging\nbenchmark for class-incremental NER, and the proposed method achieves up to\n10.62\\% improvement over the baseline methods.\n","authors":["Ruotian Ma","Xuanting Chen","Lin Zhang","Xin Zhou","Junzhe Wang","Tao Gui","Qi Zhang","Xiang Gao","Yunwen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.04676v2.pdf","comment":"Accepted by ACL 2023"},{"id":"http://arxiv.org/abs/2306.16108v2","updated":"2023-07-24T08:14:44Z","published":"2023-06-28T11:24:48Z","title":"Is ChatGPT a Biomedical Expert? -- Exploring the Zero-Shot Performance\n of Current GPT Models in Biomedical Tasks","summary":" We assessed the performance of commercial Large Language Models (LLMs)\nGPT-3.5-Turbo and GPT-4 on tasks from the 2023 BioASQ challenge. In Task 11b\nPhase B, which is focused on answer generation, both models demonstrated\ncompetitive abilities with leading systems. Remarkably, they achieved this with\nsimple zero-shot learning, grounded with relevant snippets. Even without\nrelevant snippets, their performance was decent, though not on par with the\nbest systems. Interestingly, the older and cheaper GPT-3.5-Turbo system was\nable to compete with GPT-4 in the grounded Q&A setting on factoid and list\nanswers. In Task 11b Phase A, focusing on retrieval, query expansion through\nzero-shot learning improved performance, but the models fell short compared to\nother systems. The code needed to rerun these experiments is available through\nGitHub.\n","authors":["Samy Ateia","Udo Kruschwitz"],"pdf_url":"https://arxiv.org/pdf/2306.16108v2.pdf","comment":"Preprint accepted at the 11th BioASQ Workshop at the 14th Conference\n and Labs of the Evaluation Forum (CLEF) 2023; Changes: 1. Added related work\n and experimental setup sections. 2. Reworked discussion and future work\n section. 3. Fixed multiple typos and improved style. Changed license"},{"id":"http://arxiv.org/abs/2307.12573v1","updated":"2023-07-24T07:40:59Z","published":"2023-07-24T07:40:59Z","title":"Tachikuma: Understading Complex Interactions with Multi-Character and\n Novel Objects by Large Language Models","summary":" Recent advancements in natural language and Large Language Models (LLMs) have\nenabled AI agents to simulate human-like interactions within virtual worlds.\nHowever, these interactions still face limitations in complexity and\nflexibility, particularly in scenarios involving multiple characters and novel\nobjects. Pre-defining all interactable objects in the agent's world model\npresents challenges, and conveying implicit intentions to multiple characters\nthrough complex interactions remains difficult. To address these issues, we\npropose integrating virtual Game Masters (GMs) into the agent's world model,\ndrawing inspiration from Tabletop Role-Playing Games (TRPGs). GMs play a\ncrucial role in overseeing information, estimating players' intentions,\nproviding environment descriptions, and offering feedback, compensating for\ncurrent world model deficiencies. 
To facilitate future explorations for complex\ninteractions, we introduce a benchmark named Tachikuma, comprising a Multiple\ncharacter and novel Object based interaction Estimation (MOE) task and a\nsupporting dataset. MOE challenges models to understand characters' intentions\nand accurately determine their actions within intricate contexts involving\nmulti-character and novel object interactions. Besides, the dataset captures\nlog data from real-time communications during gameplay, providing diverse,\ngrounded, and complex interactions for further explorations. Finally, we\npresent a simple prompting baseline and evaluate its performance, demonstrating\nits effectiveness in enhancing interaction understanding. We hope that our\ndataset and task will inspire further research in complex interactions with\nnatural language, fostering the development of more advanced AI agents.\n","authors":["Yuanzhi Liang","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12573v1.pdf","comment":"Preliminary version of an ongoing work"},{"id":"http://arxiv.org/abs/2307.12564v1","updated":"2023-07-24T07:17:33Z","published":"2023-07-24T07:17:33Z","title":"Towards Generalising Neural Topical Representations","summary":" Topic models have evolved from conventional Bayesian probabilistic models to\nNeural Topic Models (NTMs) over the last two decays. Although NTMs have\nachieved promising performance when trained and tested on a specific corpus,\ntheir generalisation ability across corpora is rarely studied. In practice, we\noften expect that an NTM trained on a source corpus can still produce quality\ntopical representation for documents in a different target corpus without\nretraining. In this work, we aim to improve NTMs further so that their benefits\ngeneralise reliably across corpora and tasks. To do so, we propose to model\nsimilar documents by minimising their semantical distance when training NTMs.\nSpecifically, similar documents are created by data augmentation during\ntraining; The semantical distance between documents is measured by the\nHierarchical Topic Transport Distance (HOTT), which computes the Optimal\nTransport (OT) distance between the topical representations. Our framework can\nbe readily applied to most NTMs as a plug-and-play module. Extensive\nexperiments show that our framework significantly improves the generalisation\nability regarding neural topical representation across corpora.\n","authors":["Xiaohao Yang","He Zhao","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2307.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.11578v2","updated":"2023-07-24T06:53:10Z","published":"2021-03-22T04:44:43Z","title":"SparseGAN: Sparse Generative Adversarial Network for Text Generation","summary":" It is still a challenging task to learn a neural text generation model under\nthe framework of generative adversarial networks (GANs) since the entire\ntraining process is not differentiable. The existing training strategies either\nsuffer from unreliable gradient estimations or imprecise sentence\nrepresentations. Inspired by the principle of sparse coding, we propose a\nSparseGAN that generates semantic-interpretable, but sparse sentence\nrepresentations as inputs to the discriminator. The key idea is that we treat\nan embedding matrix as an over-complete dictionary, and use a linear\ncombination of very few selected word embeddings to approximate the output\nfeature representation of the generator at each time step. 
With such\nsemantic-rich representations, we not only reduce unnecessary noises for\nefficient adversarial training, but also make the entire training process fully\ndifferentiable. Experiments on multiple text generation datasets yield\nperformance improvements, especially in sequence-level metrics, such as BLEU.\n","authors":["Liping Yuan","Jiehang Zeng","Xiaoqing Zheng"],"pdf_url":"https://arxiv.org/pdf/2103.11578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09710v3","updated":"2023-07-24T05:39:27Z","published":"2022-11-17T17:45:59Z","title":"Style Classification of Rabbinic Literature for Detection of Lost\n Midrash Tanhuma Material","summary":" Midrash collections are complex rabbinic works that consist of text in\nmultiple languages, which evolved through long processes of unstable oral and\nwritten transmission. Determining the origin of a given passage in such a\ncompilation is not always straightforward and is often a matter of dispute\namong scholars, yet it is essential for scholars' understanding of the passage\nand its relationship to other texts in the rabbinic corpus. To help solve this\nproblem, we propose a system for classification of rabbinic literature based on\nits style, leveraging recent advances in natural language processing for Hebrew\ntexts. Additionally, we demonstrate how this method can be applied to uncover\nlost material from a specific midrash genre, Tan\\d{h}uma-Yelammedenu, that has\nbeen preserved in later anthologies.\n","authors":["Shlomo Tannor","Nachum Dershowitz","Moshe Lavee"],"pdf_url":"https://arxiv.org/pdf/2211.09710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12520v1","updated":"2023-07-24T04:29:43Z","published":"2023-07-24T04:29:43Z","title":"Lost In Translation: Generating Adversarial Examples Robust to\n Round-Trip Translation","summary":" Language Models today provide a high accuracy across a large number of\ndownstream tasks. However, they remain susceptible to adversarial attacks,\nparticularly against those where the adversarial examples maintain considerable\nsimilarity to the original text. Given the multilingual nature of text, the\neffectiveness of adversarial examples across translations and how machine\ntranslations can improve the robustness of adversarial examples remain largely\nunexplored. In this paper, we present a comprehensive study on the robustness\nof current text adversarial attacks to round-trip translation. We demonstrate\nthat 6 state-of-the-art text-based adversarial attacks do not maintain their\nefficacy after round-trip translation. Furthermore, we introduce an\nintervention-based solution to this problem, by integrating Machine Translation\ninto the process of adversarial example generation and demonstrating increased\nrobustness to round-trip translation. 
Our results indicate that finding\nadversarial examples robust to translation can help identify the insufficiency\nof language models that is common across languages, and motivate further\nresearch into multilingual adversarial attacks.\n","authors":["Neel Bhandari","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12520v1.pdf","comment":"Published at International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2023"},{"id":"http://arxiv.org/abs/2009.04639v2","updated":"2023-07-24T03:56:31Z","published":"2020-09-10T02:22:21Z","title":"Improving Coreference Resolution by Leveraging Entity-Centric Features\n with Graph Neural Networks and Second-order Inference","summary":" One of the major challenges in coreference resolution is how to make use of\nentity-level features defined over clusters of mentions rather than mention\npairs. However, coreferent mentions usually spread far apart in an entire text,\nwhich makes it extremely difficult to incorporate entity-level features. We\npropose a graph neural network-based coreference resolution method that can\ncapture the entity-centric information by encouraging the sharing of features\nacross all mentions that probably refer to the same real-world entity. Mentions\nare linked to each other via the edges modeling how likely two linked mentions\npoint to the same entity. Modeling by such graphs, the features between\nmentions can be shared by message passing operations in an entity-centric\nmanner. A global inference algorithm up to second-order features is also\npresented to optimally cluster mentions into consistent groups. Experimental\nresults show our graph neural network-based method combing with the\nsecond-order decoding algorithm (named GNNCR) achieved close to\nstate-of-the-art performance on the English CoNLL-2012 Shared Task dataset.\n","authors":["Lu Liu","Zhenqiao Song","Xiaoqing Zheng","Jun He"],"pdf_url":"https://arxiv.org/pdf/2009.04639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12507v1","updated":"2023-07-24T03:44:17Z","published":"2023-07-24T03:44:17Z","title":"Investigating the Existence of \"Secret Language'' in Language Models","summary":" In this paper, we study the problem of secret language in NLP, where current\nlanguage models (LMs) seem to have a hidden vocabulary that allows them to\ninterpret absurd inputs as meaningful concepts. We investigate two research\nquestions: ``Does the secret language phenomenon exist in different language\nmodels?'' and ``Does secret language depend on specific context?'' To answer\nthese questions, we introduce a novel method named \\textit{SecretFinding}, a\ngradient-based approach that can automatically discover secret languages in\nLMs. We conduct experiments on five representative models (Electra, ALBERT,\nRoberta, DistillBERT, and CLIP) finetuned on four NLP benchmarks (SST-2, MRPC,\nSNLI, and SQuAD) and a language-grounding benchmark (MSCOCO). Our experimental\nresults show that even when we replace the most important words with others\nthat are semantically dissimilar to the original words in a sentence, LMs do\nnot consider the new sentence semantically dissimilar to the original, as the\noutput does not change with a high probability. This phenomenon holds true\nacross the five models and five tasks and gives a positive answer to the first\nresearch question. 
As for the second research question, we find that the secret\nlanguage discovered by \\textit{SecretFinding} is quite general and could even\nbe transferred to other models in the black-box settings, such as GPT-3 and\nChatGPT. Finally, we discuss the causes of secret language, how to eliminate\nit, the potential connection to memorization, and ethical implications.\nExamples of secret language found by SecretFinding are available on\nhttps://huggingface.co/spaces/anonymousauthors/ACL23_SecretLanguage.\n","authors":["Yimu Wang","Peng Shi","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13040v3","updated":"2023-07-24T03:31:42Z","published":"2023-05-22T13:47:51Z","title":"SpokenWOZ: A Large-Scale Speech-Text Benchmark for Spoken Task-Oriented\n Dialogue Agents","summary":" Task-oriented dialogue (TOD) models have made significant progress in recent\nyears. However, previous studies primarily focus on datasets written by\nannotators, which has resulted in a gap between academic research and\nreal-world spoken conversation scenarios. While several small-scale spoken TOD\ndatasets are proposed to address robustness issues such as ASR errors, they\nignore the unique challenges in spoken conversation. To tackle the limitations,\nwe introduce SpokenWOZ, a large-scale speech-text dataset for spoken TOD,\ncontaining 8 domains, 203k turns, 5.7k dialogues and 249 hours of audios from\nhuman-to-human spoken conversations. SpokenWOZ further incorporates common\nspoken characteristics such as word-by-word processing and reasoning in spoken\nlanguage. Based on these characteristics, we present cross-turn slot and\nreasoning slot detection as new challenges. We conduct experiments on various\nbaselines, including text-modal models, newly proposed dual-modal models, and\nLLMs, e.g., ChatGPT. The results show that the current models still have\nsubstantial room for improvement in spoken conversation, where the most\nadvanced dialogue state tracker only achieves 25.65% in joint goal accuracy and\nthe SOTA end-to-end model only correctly completes the user request in 52.1% of\ndialogues. The dataset, code, and leaderboard are available:\nhttps://spokenwoz.github.io/SpokenWOZ-github.io/.\n","authors":["Shuzheng Si","Wentao Ma","Haoyu Gao","Yuchuan Wu","Ting-En Lin","Yinpei Dai","Hangyu Li","Rui Yan","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2305.13040v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.07481v2","updated":"2023-07-24T03:26:17Z","published":"2020-09-16T05:58:00Z","title":"Unsupervised Summarization by Jointly Extracting Sentences and Keywords","summary":" We present RepRank, an unsupervised graph-based ranking model for extractive\nmulti-document summarization in which the similarity between words, sentences,\nand word-to-sentence can be estimated by the distances between their vector\nrepresentations in a unified vector space. In order to obtain desirable\nrepresentations, we propose a self-attention based learning method that\nrepresent a sentence by the weighted sum of its word embeddings, and the\nweights are concentrated to those words hopefully better reflecting the content\nof a document. We show that salient sentences and keywords can be extracted in\na joint and mutual reinforcement process using our learned representations, and\nprove that this process always converges to a unique solution leading to\nimprovement in performance. 
A variant of absorbing random walk and the\ncorresponding sampling-based algorithm are also described to avoid redundancy\nand increase diversity in the summaries. Experiment results with multiple\nbenchmark datasets show that RepRank achieved the best or comparable\nperformance in ROUGE.\n","authors":["Zongyi Li","Xiaoqing Zheng","Jun He"],"pdf_url":"https://arxiv.org/pdf/2009.07481v2.pdf","comment":"10 pages(includes 2 pages references), 1 figure"},{"id":"http://arxiv.org/abs/2307.12498v1","updated":"2023-07-24T03:07:40Z","published":"2023-07-24T03:07:40Z","title":"Robust Automatic Speech Recognition via WavAugment Guided Phoneme\n Adversarial Training","summary":" Developing a practically-robust automatic speech recognition (ASR) is\nchallenging since the model should not only maintain the original performance\non clean samples, but also achieve consistent efficacy under small volume\nperturbations and large domain shifts. To address this problem, we propose a\nnovel WavAugment Guided Phoneme Adversarial Training (wapat). wapat use\nadversarial examples in phoneme space as augmentation to make the model\ninvariant to minor fluctuations in phoneme representation and preserve the\nperformance on clean samples. In addition, wapat utilizes the phoneme\nrepresentation of augmented samples to guide the generation of adversaries,\nwhich helps to find more stable and diverse gradient-directions, resulting in\nimproved generalization. Extensive experiments demonstrate the effectiveness of\nwapat on End-to-end Speech Challenge Benchmark (ESB). Notably, SpeechLM-wapat\noutperforms the original model by 6.28% WER reduction on ESB, achieving the new\nstate-of-the-art.\n","authors":["Gege Qi","Yuefeng Chen","Xiaofeng Mao","Xiaojun Jia","Ranjie Duan","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11610v2","updated":"2023-07-24T01:35:47Z","published":"2023-07-21T14:25:39Z","title":"CausE: Towards Causal Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) focuses on representing the entities and\nrelations of a knowledge graph (KG) into the continuous vector spaces, which\ncan be employed to predict the missing triples to achieve knowledge graph\ncompletion (KGC). However, KGE models often only briefly learn structural\ncorrelations of triple data and embeddings would be misled by the trivial\npatterns and noisy links in real-world KGs. To address this issue, we build the\nnew paradigm of KGE in the context of causality and embedding disentanglement.\nWe further propose a Causality-enhanced knowledge graph Embedding (CausE)\nframework. CausE employs causal intervention to estimate the causal effect of\nthe confounder embeddings and design new training objectives to make stable\npredictions. Experimental results demonstrate that CausE could outperform the\nbaseline models and achieve state-of-the-art KGC performance. 
We release our\ncode in https://github.com/zjukg/CausE.\n","authors":["Yichi Zhang","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.11610v2.pdf","comment":"Accepted by CCKS 2023 as a research paper"},{"id":"http://arxiv.org/abs/2306.14096v4","updated":"2023-07-24T00:58:11Z","published":"2023-06-25T02:24:30Z","title":"Chinese Fine-Grained Financial Sentiment Analysis with Large Language\n Models","summary":" Entity-level fine-grained sentiment analysis in the financial domain is a\ncrucial subtask of sentiment analysis and currently faces numerous challenges.\nThe primary challenge stems from the lack of high-quality and large-scale\nannotated corpora specifically designed for financial text sentiment analysis,\nwhich in turn limits the availability of data necessary for developing\neffective text processing techniques. Recent advancements in large language\nmodels (LLMs) have yielded remarkable performance in natural language\nprocessing tasks, primarily centered around language pattern matching. In this\npaper, we propose a novel and extensive Chinese fine-grained financial\nsentiment analysis dataset, FinChina SA, for enterprise early warning. We\nthoroughly evaluate and experiment with well-known existing open-source LLMs\nusing our dataset. We firmly believe that our dataset will serve as a valuable\nresource to advance the exploration of real-world financial sentiment analysis\ntasks, which should be the focus of future research. The FinChina SA dataset is\npublicly available at https://github.com/YerayL/FinChina-SA\n","authors":["Yinyu Lan","Yanru Wu","Wang Xu","Weiqiang Feng","Youhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14096v4.pdf","comment":"FinLLM Symposium at IJCAI 2023"},{"id":"http://arxiv.org/abs/2305.01788v3","updated":"2023-07-24T00:54:51Z","published":"2023-05-02T21:33:10Z","title":"Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation\n Incorporating Gloss Information","summary":" Visual Word Sense Disambiguation (VWSD) is a task to find the image that most\naccurately depicts the correct sense of the target word for the given context.\nPreviously, image-text matching models often suffered from recognizing\npolysemous words. This paper introduces an unsupervised VWSD approach that uses\ngloss information of an external lexical knowledge-base, especially the sense\ndefinitions. Specifically, we suggest employing Bayesian inference to\nincorporate the sense definitions when sense information of the answer is not\nprovided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we\npropose a context-aware definition generation with GPT-3. Experimental results\nshow that the VWSD performance significantly increased with our Bayesian\ninference-based approach. In addition, our context-aware definition generation\nachieved prominent performance improvement in OOD examples exhibiting better\nperformance than the existing definition generation method.\n","authors":["Sunjae Kwon","Rishabh Garodia","Minhwa Lee","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2305.01788v3.pdf","comment":"ACL 2023, https://aclanthology.org/2023.acl-long.88"},{"id":"http://arxiv.org/abs/2307.02591v2","updated":"2023-07-24T00:47:23Z","published":"2023-07-05T18:41:29Z","title":"ODD: A Benchmark Dataset for the NLP-based Opioid Related Aberrant\n Behavior Detection","summary":" Opioid related aberrant behaviors (ORAB) present novel risk factors for\nopioid overdose. 
Previously, ORAB have been mainly assessed by survey results\nand by monitoring drug administrations. Such methods, however, cannot scale up\nand do not cover the entire spectrum of aberrant behaviors. On the other hand,\nORAB are widely documented in electronic health record notes. This paper\nintroduces a novel biomedical natural language processing benchmark dataset\nnamed ODD, for ORAB Detection Dataset. ODD is an expert-annotated dataset\ncomprising more than 750 publicly available EHR notes. ODD has been designed\nto identify ORAB from patients' EHR notes and classify them into nine\ncategories: 1) Confirmed Aberrant Behavior, 2) Suggested Aberrant Behavior, 3)\nOpioids, 4) Indication, 5) Diagnosed opioid dependency, 6) Benzodiazepines, 7)\nMedication Changes, 8) Central Nervous System-related, and 9) Social\nDeterminants of Health. We explored two state-of-the-art natural language\nprocessing (NLP) models (finetuning pretrained language models and\nprompt-tuning approaches) to identify ORAB. Experimental results show that the\nprompt-tuning models outperformed the finetuning models in most categories and\nthe gains were especially higher among uncommon categories (Suggested aberrant\nbehavior, Diagnosed opioid dependency and Medication change). Although the best\nmodel achieved the highest area under the precision-recall curve of 83.92%,\nuncommon classes (Suggested Aberrant Behavior, Diagnosed Opioid Dependence, and\nMedication Change) still leave large room for performance improvement.\n","authors":["Sunjae Kwon","Xun Wang","Weisong Liu","Emily Druhl","Minhee L. Sung","Joel I. Reisman","Wenjun Li","Robert D. Kerns","William Becker","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2307.02591v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.13176v1","updated":"2023-07-24T23:53:13Z","published":"2023-07-24T23:53:13Z","title":"Schema-Driven Actionable Insight Generation and Smart Recommendation","summary":" In natural language generation (NLG), insight mining is seen as a\ndata-to-text task, where data is mined for interesting patterns and verbalised\ninto 'insight' statements. An 'over-generate and rank' paradigm is intuitively\nused to generate such insights. The multidimensionality and subjectivity of\nthis process make it challenging. This paper introduces a schema-driven method\nto generate actionable insights from data to drive growth and change. It also\nintroduces a technique to rank the insights to align with user interests based\non their feedback. We show preliminary qualitative results of the insights\ngenerated using our technique and demonstrate its ability to adapt to feedback.\n","authors":["Allmin Susaiyah","Aki Härmä","Milan Petković"],"pdf_url":"https://arxiv.org/pdf/2307.13176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13173v1","updated":"2023-07-24T23:42:32Z","published":"2023-07-24T23:42:32Z","title":"Opinion Mining Using Population-tuned Generative Language Models","summary":" We present a novel method for mining opinions from text collections using\ngenerative language models trained on data collected from different\npopulations. We describe the basic definitions, methodology and a generic\nalgorithm for opinion insight mining. We demonstrate the performance of our\nmethod in an experiment where a pre-trained generative model is fine-tuned\nusing specifically tailored content with unnatural and fully annotated\nopinions. 
We show that our approach can learn and transfer the opinions to the\nsemantic classes while maintaining the proportion of polarisation. Finally, we\ndemonstrate the usage of an insight mining system to scale up the discovery of\nopinion insights from a real text corpus.\n","authors":["Allmin Susaiyah","Abhinay Pandya","Aki Härmä"],"pdf_url":"https://arxiv.org/pdf/2307.13173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13128v1","updated":"2023-07-24T21:05:47Z","published":"2023-07-24T21:05:47Z","title":"Explaining Math Word Problem Solvers","summary":" Automated math word problem solvers based on neural networks have\nsuccessfully managed to obtain 70-80\\% accuracy in solving arithmetic word\nproblems. However, it has been shown that these solvers may rely on superficial\npatterns to obtain their equations. In order to determine what information math\nword problem solvers use to generate solutions, we remove parts of the input\nand measure the model's performance on the perturbed dataset. Our results show\nthat the model is not sensitive to the removal of many words from the input and\ncan still manage to find a correct answer when given a nonsense question. This\nindicates that automatic solvers do not follow the semantic logic of math word\nproblems, and may be overfitting to the presence of specific words.\n","authors":["Abby Newcomb","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2307.13128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.15498v2","updated":"2023-07-24T20:08:20Z","published":"2021-06-29T15:25:33Z","title":"Classification of Consumer Belief Statements From Social Media","summary":" Social media offer plenty of information to perform market research in order\nto meet the requirements of customers. One way how this research is conducted\nis that a domain expert gathers and categorizes user-generated content into a\ncomplex and fine-grained class structure. In many of such cases, little data\nmeets complex annotations. It is not yet fully understood how this can be\nleveraged successfully for classification. We examine the classification\naccuracy of expert labels when used with a) many fine-grained classes and b)\nfew abstract classes. For scenario b) we compare abstract class labels given by\nthe domain expert as baseline and by automatic hierarchical clustering. We\ncompare this to another baseline where the entire class structure is given by a\ncompletely unsupervised clustering approach. By doing so, this work can serve\nas an example of how complex expert annotations are potentially beneficial and\ncan be utilized in the most optimal way for opinion mining in highly specific\ndomains. By exploring across a range of techniques and experiments, we find\nthat automated class abstraction approaches in particular the unsupervised\napproach performs remarkably well against domain expert baseline on text\nclassification tasks. 
This has the potential to inspire opinion mining\napplications in order to support market researchers in practice and to inspire\nfine-grained automated content analysis on a large scale.\n","authors":["Gerhard Johann Hagerer","Wenbin Le","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2106.15498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.10575v2","updated":"2023-07-24T20:07:07Z","published":"2021-10-20T14:04:13Z","title":"SocialVisTUM: An Interactive Visualization Toolkit for Correlated Neural\n Topic Models on Social Media Opinion Mining","summary":" Recent research in opinion mining proposed word embedding-based topic\nmodeling methods that provide superior coherence compared to traditional topic\nmodeling. In this paper, we demonstrate how these methods can be used to\ndisplay correlated topic models on social media texts using SocialVisTUM, our\nproposed interactive visualization toolkit. It displays a graph with topics as\nnodes and their correlations as edges. Further details are displayed\ninteractively to support the exploration of large text collections, e.g.,\nrepresentative words and sentences of topics, topic and sentiment\ndistributions, hierarchical topic clustering, and customizable, predefined\ntopic labels. The toolkit optimizes automatically on custom data for optimal\ncoherence. We show a working instance of the toolkit on data crawled from\nEnglish social media discussions about organic food consumption. The\nvisualization confirms findings of a qualitative consumer research study.\nSocialVisTUM and its training procedures are accessible online.\n","authors":["Gerhard Johann Hagerer","Martin Kirchhoff","Hannah Danner","Robert Pesch","Mainak Ghosh","Archishman Roy","Jiaxi Zhao","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2110.10575v2.pdf","comment":"Demo paper accepted for publication on RANLP 2021; 8 pages, 5\n figures, 1 table"},{"id":"http://arxiv.org/abs/2110.15134v2","updated":"2023-07-24T20:05:38Z","published":"2021-10-28T14:09:44Z","title":"An Analysis of Programming Course Evaluations Before and After the\n Introduction of an Autograder","summary":" Commonly, introductory programming courses in higher education institutions\nhave hundreds of participating students eager to learn to program. The manual\neffort for reviewing the submitted source code and for providing feedback can\nno longer be managed. Manually reviewing the submitted homework can be\nsubjective and unfair, particularly if many tutors are responsible for grading.\nDifferent autograders can help in this situation; however, there is a lack of\nknowledge about how autograders can impact students' overall perception of\nprogramming classes and teaching. This is relevant for course organizers and\ninstitutions to keep their programming courses attractive while coping with\nincreasing students.\n This paper studies the answers to the standardized university evaluation\nquestionnaires of multiple large-scale foundational computer science courses\nwhich recently introduced autograding. The differences before and after this\nintervention are analyzed. By incorporating additional observations, we\nhypothesize how the autograder might have contributed to the significant\nchanges in the data, such as, improved interactions between tutors and\nstudents, improved overall course quality, improved learning success, increased\ntime spent, and reduced difficulty. 
This qualitative study aims to provide\nhypotheses for future research to define and conduct quantitative surveys and\ndata analysis. The autograder technology can be validated as a teaching method\nto improve student satisfaction with programming courses.\n","authors":["Gerhard Johann Hagerer","Laura Lahesoo","Miriam Anschütz","Stephan Krusche","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2110.15134v2.pdf","comment":"Accepted full paper article on IEEE ITHET 2021"},{"id":"http://arxiv.org/abs/2111.02259v3","updated":"2023-07-24T20:03:14Z","published":"2021-11-03T14:49:50Z","title":"A Case Study and Qualitative Analysis of Simple Cross-Lingual Opinion\n Mining","summary":" User-generated content from social media is produced in many languages,\nmaking it technically challenging to compare the discussed themes from one\ndomain across different cultures and regions. It is relevant for domains in a\nglobalized world, such as market research, where people from two nations and\nmarkets might have different requirements for a product. We propose a simple,\nmodern, and effective method for building a single topic model with sentiment\nanalysis capable of covering multiple languages simultaneously, based on a\npre-trained state-of-the-art deep neural network for natural language\nunderstanding. To demonstrate its feasibility, we apply the model to newspaper\narticles and user comments of a specific domain, i.e., organic food products\nand related consumption behavior. The themes match across languages.\nAdditionally, we obtain a high proportion of stable and domain-relevant\ntopics, a meaningful relation between topics and their respective textual\ncontents, and an interpretable representation for social media documents.\nMarketing can potentially benefit from our method, since it provides an\neasy-to-use means of addressing specific customer interests from different\nmarket regions around the globe. For reproducibility, we provide the code,\ndata, and results of our study.\n","authors":["Gerhard Johann Hagerer","Wing Sheung Leung","Qiaoxi Liu","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2111.02259v3.pdf","comment":"10 pages, 2 tables, 5 figures, full paper, peer-reviewed, published\n at KDIR/IC3k 2021 conference"},{"id":"http://arxiv.org/abs/2307.13106v1","updated":"2023-07-24T19:54:15Z","published":"2023-07-24T19:54:15Z","title":"How to use LLMs for Text Analysis","summary":" This guide introduces Large Language Models (LLM) as a highly versatile text\nanalysis method within the social sciences. As LLMs are easy-to-use, cheap,\nfast, and applicable to a broad range of text analysis tasks, ranging from text\nannotation and classification to sentiment analysis and critical discourse\nanalysis, many scholars believe that LLMs will transform how we do text\nanalysis. This how-to guide is aimed at students and researchers with limited\nprogramming experience, and offers a simple introduction to how LLMs can be\nused for text analysis in your own research project, as well as advice on best\npractices. We will go through each of the steps of analyzing textual data with\nLLMs using Python: installing the software, setting up the API, loading the\ndata, developing an analysis prompt, analyzing the text, and validating the\nresults. 
As an illustrative example, we will use the challenging task of\nidentifying populism in political texts, and show how LLMs move beyond the\nexisting state-of-the-art.\n","authors":["Petter Törnberg"],"pdf_url":"https://arxiv.org/pdf/2307.13106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.02326v2","updated":"2023-07-24T19:44:53Z","published":"2021-11-03T16:20:16Z","title":"End-to-End Annotator Bias Approximation on Crowdsourced Single-Label\n Sentiment Analysis","summary":" Sentiment analysis is often a crowdsourcing task prone to subjective labels\ngiven by many annotators. It is not yet fully understood how the annotation\nbias of each annotator can be modeled correctly with state-of-the-art methods.\nHowever, resolving annotator bias precisely and reliably is the key to\nunderstand annotators' labeling behavior and to successfully resolve\ncorresponding individual misconceptions and wrongdoings regarding the\nannotation task. Our contribution is an explanation and improvement for precise\nneural end-to-end bias modeling and ground truth estimation, which reduces an\nundesired mismatch in that regard of the existing state-of-the-art.\nClassification experiments show that it has potential to improve accuracy in\ncases where each sample is annotated only by one single annotator. We provide\nthe whole source code publicly and release an own domain-specific sentiment\ndataset containing 10,000 sentences discussing organic food products. These are\ncrawled from social media and are singly labeled by 10 non-expert annotators.\n","authors":["Gerhard Johann Hagerer","David Szabo","Andreas Koch","Maria Luisa Ripoll Dominguez","Christian Widmer","Maximilian Wich","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2111.02326v2.pdf","comment":"10 pages, 2 figures, 2 tables, full conference paper, peer-reviewed"},{"id":"http://arxiv.org/abs/2305.17008v2","updated":"2023-07-24T19:18:25Z","published":"2023-05-26T15:09:11Z","title":"NormBank: A Knowledge Bank of Situational Social Norms","summary":" We present NormBank, a knowledge bank of 155k situational norms. This\nresource is designed to ground flexible normative reasoning for interactive,\nassistive, and collaborative AI systems. Unlike prior commonsense resources,\nNormBank grounds each inference within a multivalent sociocultural frame, which\nincludes the setting (e.g., restaurant), the agents' contingent roles (waiter,\ncustomer), their attributes (age, gender), and other physical, social, and\ncultural constraints (e.g., the temperature or the country of operation). In\ntotal, NormBank contains 63k unique constraints from a taxonomy that we\nintroduce and iteratively refine here. Constraints then apply in different\ncombinations to frame social norms. Under these manipulations, norms are\nnon-monotonic - one can cancel an inference by updating its frame even\nslightly. Still, we find evidence that neural models can help reliably extend\nthe scope and coverage of NormBank. 
We further demonstrate the utility of this\nresource with a series of transfer experiments.\n","authors":["Caleb Ziems","Jane Dwivedi-Yu","Yi-Chia Wang","Alon Halevy","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.17008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13085v1","updated":"2023-07-24T19:14:38Z","published":"2023-07-24T19:14:38Z","title":"Making Metadata More FAIR Using Large Language Models","summary":" With the global increase in experimental data artifacts, harnessing them in a\nunified fashion leads to a major stumbling block - bad metadata. To bridge this\ngap, this work presents a Natural Language Processing (NLP) informed\napplication, called FAIRMetaText, that compares metadata. Specifically,\nFAIRMetaText analyzes the natural language descriptions of metadata and\nprovides a mathematical similarity measure between two terms. This measure can\nthen be utilized for analyzing varied metadata, by suggesting terms for\ncompliance or grouping similar terms for identification of replaceable terms.\nThe efficacy of the algorithm is presented qualitatively and quantitatively on\npublicly available research artifacts and demonstrates large gains across\nmetadata-related tasks through an in-depth study of a wide variety of Large\nLanguage Models (LLMs). This software can drastically reduce the human effort\nin sifting through various natural language metadata while employing several\nexperimental datasets on the same topic.\n","authors":["Sowmya S. Sundaram","Mark A. Musen"],"pdf_url":"https://arxiv.org/pdf/2307.13085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00017v2","updated":"2023-07-24T18:46:22Z","published":"2023-05-30T15:15:40Z","title":"Towards Explainable and Language-Agnostic LLMs: Symbolic Reverse\n Engineering of Language at Scale","summary":" Large language models (LLMs) have achieved a milestone that undeniably\nchanged many held beliefs in artificial intelligence (AI). However, there\nremain many limitations of these LLMs when it comes to true language\nunderstanding, limitations that are a byproduct of the underlying architecture\nof deep neural networks. Moreover, and due to their subsymbolic nature,\nwhatever knowledge these models acquire about how language works will always be\nburied in billions of microfeatures (weights), none of which is meaningful on\nits own, making such models hopelessly unexplainable. To address these\nlimitations, we suggest combining the strength of symbolic representations\nwith what we believe to be the key to the success of LLMs, namely a successful\nbottom-up reverse engineering of language at scale. As such we argue for a\nbottom-up reverse engineering of language in a symbolic setting. Hints on what\nthis project amounts to have been suggested by several authors, and we discuss\nin some detail here how this project could be accomplished.\n","authors":["Walid S. Saba"],"pdf_url":"https://arxiv.org/pdf/2306.00017v2.pdf","comment":"Draft, preprint"},{"id":"http://arxiv.org/abs/2307.13018v1","updated":"2023-07-24T17:17:13Z","published":"2023-07-24T17:17:13Z","title":"The potential of LLMs for coding with low-resource and domain-specific\n programming languages","summary":" This paper presents a study on the feasibility of using large language models\n(LLM) for coding with low-resource and domain-specific programming languages\nthat typically lack the amount of data required for effective LLM processing\ntechniques. 
This study focuses on the econometric scripting language named\nhansl of the open-source software gretl and employs a proprietary LLM based on\nGPT-3.5. Our findings suggest that LLMs can be a useful tool for writing,\nunderstanding, improving, and documenting gretl code, which includes generating\ndescriptive docstrings for functions and providing precise explanations for\nabstract and poorly documented econometric code. While the LLM showcased\npromising docstring-to-code translation capability, we also identify some\nlimitations, such as its inability to improve certain sections of code and to\nwrite accurate unit tests. This study is a step towards leveraging the power of\nLLMs to facilitate software development in low-resource programming languages\nand ultimately to lower barriers to entry for their adoption.\n","authors":["Artur Tarassow"],"pdf_url":"https://arxiv.org/pdf/2307.13018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14361v1","updated":"2023-07-24T21:01:46Z","published":"2023-07-24T21:01:46Z","title":"A Hybrid Machine Learning Model for Classifying Gene Mutations in Cancer\n using LSTM, BiLSTM, CNN, GRU, and GloVe","summary":" This study presents an ensemble model combining LSTM, BiLSTM, CNN, GRU, and\nGloVe to classify gene mutations using Kaggle's Personalized Medicine:\nRedefining Cancer Treatment dataset. The results were compared against\nwell-known transformers such as BERT, Electra, Roberta, XLNet, Distilbert, and\ntheir LSTM ensembles. Our model outperformed all other models in terms of\naccuracy, precision, recall, F1 score, and Mean Squared Error. Surprisingly, it\nalso needed less training time, resulting in a perfect combination of\nperformance and efficiency. This study demonstrates the utility of ensemble\nmodels for difficult tasks such as gene mutation classification.\n","authors":["Sanad Aburass","Osama Dorgham","Jamil Al Shaqsi"],"pdf_url":"https://arxiv.org/pdf/2307.14361v1.pdf","comment":"6 pages, 7 figures and 2 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi-view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). 
Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2209.05407v3","updated":"2023-07-24T17:58:31Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen unknown categories into instances, without any prior knowledge about\nthem, while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12980v1","updated":"2023-07-24T17:58:06Z","published":"2023-07-24T17:58:06Z","title":"A Systematic Survey of Prompt Engineering on Vision-Language Foundation\n Models","summary":" Prompt engineering is a technique that involves augmenting a large\npre-trained model with task-specific hints, known as prompts, to adapt the\nmodel to new tasks. Prompts can be created manually as natural language\ninstructions or generated automatically as either natural language instructions\nor vector representations. Prompt engineering enables the ability to perform\npredictions based solely on prompts without updating model parameters, and the\neasier application of large pre-trained models in real-world tasks. 
In past\nyears, Prompt engineering has been well-studied in natural language processing.\nRecently, it has also been intensively studied in vision-language modeling.\nHowever, there is currently a lack of a systematic overview of prompt\nengineering on pre-trained vision-language models. This paper aims to provide a\ncomprehensive survey of cutting-edge research in prompt engineering on three\ntypes of vision-language models: multimodal-to-text generation models (e.g.\nFlamingo), image-text matching models (e.g. CLIP), and text-to-image generation\nmodels (e.g. Stable Diffusion). For each type of model, a brief model summary,\nprompting methods, prompting-based applications, and the corresponding\nresponsibility and integrity issues are summarized and discussed. Furthermore,\nthe commonalities and differences between prompting on vision-language models,\nlanguage models, and vision models are also discussed. The challenges, future\ndirections, and research opportunities are summarized to foster future research\non this topic.\n","authors":["Jindong Gu","Zhen Han","Shuo Chen","Ahmad Beirami","Bailan He","Gengyuan Zhang","Ruotong Liao","Yao Qin","Volker Tresp","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2307.12980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12972v1","updated":"2023-07-24T17:49:11Z","published":"2023-07-24T17:49:11Z","title":"DFA3D: 3D Deformable Attention For 2D-to-3D Feature Lifting","summary":" In this paper, we propose a new operator, called 3D DeFormable Attention\n(DFA3D), for 2D-to-3D feature lifting, which transforms multi-view 2D image\nfeatures into a unified 3D space for 3D object detection. Existing feature\nlifting approaches, such as Lift-Splat-based and 2D attention-based, either use\nestimated depth to get pseudo LiDAR features and then splat them to a 3D space,\nwhich is a one-pass operation without feature refinement, or ignore depth and\nlift features by 2D attention mechanisms, which achieve finer semantics while\nsuffering from a depth ambiguity problem. In contrast, our DFA3D-based method\nfirst leverages the estimated depth to expand each view's 2D feature map to 3D\nand then utilizes DFA3D to aggregate features from the expanded 3D feature\nmaps. With the help of DFA3D, the depth ambiguity problem can be effectively\nalleviated from the root, and the lifted features can be progressively refined\nlayer by layer, thanks to the Transformer-like architecture. In addition, we\npropose a mathematically equivalent implementation of DFA3D which can\nsignificantly improve its memory efficiency and computational speed. We\nintegrate DFA3D into several methods that use 2D attention-based feature\nlifting with only a few modifications in code and evaluate on the nuScenes\ndataset. The experiment results show a consistent improvement of +1.41\\% mAP on\naverage, and up to +15.1\\% mAP improvement when high-quality depth information\nis available, demonstrating the superiority, applicability, and huge potential\nof DFA3D. 
The code is available at\nhttps://github.com/IDEA-Research/3D-deformable-attention.git.\n","authors":["Hongyang Li","Hao Zhang","Zhaoyang Zeng","Shilong Liu","Feng Li","Tianhe Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12970v1","updated":"2023-07-24T17:49:04Z","published":"2023-07-24T17:49:04Z","title":"Volcanic ash delimitation using Artificial Intelligence based on Pix2Pix","summary":" Volcanic eruptions emit ash that can be harmful to human health and cause\ndamage to infrastructure, economic activities and the environment. The\ndelimitation of ash clouds allows to know their behavior and dispersion, which\nhelps in the prevention and mitigation of this phenomenon. Traditional methods\ntake advantage of specialized software programs to process the bands or\nchannels that compose the satellite images. However, their use is limited to\nexperts and demands a lot of time and significant computational resources. In\nrecent years, Artificial Intelligence has been a milestone in the computational\ntreatment of complex problems in different areas. In particular, Deep Learning\ntechniques allow automatic, fast and accurate processing of digital images. The\npresent work proposes the use of the Pix2Pix model, a type of generative\nadversarial network that, once trained, learns the mapping of input images to\noutput images. The architecture of such a network consisting of a generator and\na discriminator provides the versatility needed to produce black and white ash\ncloud images from multispectral satellite images. The evaluation of the model,\nbased on loss and accuracy plots, a confusion matrix, and visual inspection,\nindicates a satisfactory solution for accurate ash cloud delineation,\napplicable in any area of the world and becomes a useful tool in risk\nmanagement.\n","authors":["Christian Carrillo","Gissela Torres","Christian Mejia-Escobar"],"pdf_url":"https://arxiv.org/pdf/2307.12970v1.pdf","comment":"18 pages, in Spanish language, 15 figures"},{"id":"http://arxiv.org/abs/2307.12967v1","updated":"2023-07-24T17:45:40Z","published":"2023-07-24T17:45:40Z","title":"Learning Dense Correspondences between Photos and Sketches","summary":" Humans effortlessly grasp the connection between sketches and real-world\nobjects, even when these sketches are far from realistic. Moreover, human\nsketch understanding goes beyond categorization -- critically, it also entails\nunderstanding how individual elements within a sketch correspond to parts of\nthe physical world it represents. What are the computational ingredients needed\nto support this ability? Towards answering this question, we make two\ncontributions: first, we introduce a new sketch-photo correspondence benchmark,\n$\\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across\n125 object categories, augmenting the existing Sketchy dataset with\nfine-grained correspondence metadata. Second, we propose a self-supervised\nmethod for learning dense correspondences between sketch-photo pairs, building\nupon recent advances in correspondence learning for pairs of photos. Our model\nuses a spatial transformer network to estimate the warp flow between latent\nrepresentations of a sketch and photo extracted by a contrastive learning-based\nConvNet backbone. We found that this approach outperformed several strong\nbaselines and produced predictions that were quantitatively consistent with\nother warp-based methods. 
However, our benchmark also revealed systematic\ndifferences between predictions of the suite of models we tested and those of\nhumans. Taken together, our work suggests a promising path towards developing\nartificial systems that achieve more human-like understanding of visual images\nat different levels of abstraction. Project page:\nhttps://photo-sketch-correspondence.github.io\n","authors":["Xuanchen Lu","Xiaolong Wang","Judith E Fan"],"pdf_url":"https://arxiv.org/pdf/2307.12967v1.pdf","comment":"Accepted to ICML 2023. Project page:\n https://photo-sketch-correspondence.github.io"},{"id":"http://arxiv.org/abs/2307.12964v1","updated":"2023-07-24T17:43:13Z","published":"2023-07-24T17:43:13Z","title":"Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature\n Alignment","summary":" Text-to-video retrieval systems have recently made significant progress by\nutilizing pre-trained models trained on large-scale image-text pairs. However,\nmost of the latest methods primarily focus on the video modality while\ndisregarding the audio signal for this task. Nevertheless, a recent advancement\nby ECLIPSE has improved long-range text-to-video retrieval by developing an\naudiovisual video representation. Nonetheless, the objective of the\ntext-to-video retrieval task is to capture the complementary audio and video\ninformation that is pertinent to the text query rather than simply achieving\nbetter audio and video alignment. To address this issue, we introduce TEFAL, a\nTExt-conditioned Feature ALignment method that produces both audio and video\nrepresentations conditioned on the text query. Instead of using only an\naudiovisual attention block, which could suppress the audio information\nrelevant to the text query, our approach employs two independent cross-modal\nattention blocks that enable the text to attend to the audio and video\nrepresentations separately. Our proposed method's efficacy is demonstrated on\nfour benchmark datasets that include audio: MSR-VTT, LSMDC, VATEX, and\nCharades, and achieves better than state-of-the-art performance consistently\nacross the four datasets. This is attributed to the additional\ntext-query-conditioned audio representation and the complementary information\nit adds to the text-query-conditioned video representation.\n","authors":["Sarah Ibrahimi","Xiaohang Sun","Pichao Wang","Amanmeet Garg","Ashutosh Sanan","Mohamed Omar"],"pdf_url":"https://arxiv.org/pdf/2307.12964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12941v1","updated":"2023-07-24T17:11:39Z","published":"2023-07-24T17:11:39Z","title":"On Privileged and Convergent Bases in Neural Network Representations","summary":" In this study, we investigate whether the representations learned by neural\nnetworks possess a privileged and convergent basis. Specifically, we examine\nthe significance of feature directions represented by individual neurons.\nFirst, we establish that arbitrary rotations of neural representations cannot\nbe inverted (unlike linear networks), indicating that they do not exhibit\ncomplete rotational invariance. Subsequently, we explore the possibility of\nmultiple bases achieving identical performance. To do this, we compare the\nbases of networks trained with the same parameters but with varying random\ninitializations. 
Our study reveals two findings: (1) Even in wide networks such\nas WideResNets, neural networks do not converge to a unique basis; (2) Basis\ncorrelation increases significantly when a few early layers of the network are\nfrozen identically.\n Furthermore, we analyze Linear Mode Connectivity, which has been studied as a\nmeasure of basis correlation. Our findings give evidence that while Linear Mode\nConnectivity improves with increased network width, this improvement is not due\nto an increase in basis correlation.\n","authors":["Davis Brown","Nikhil Vyas","Yamini Bansal"],"pdf_url":"https://arxiv.org/pdf/2307.12941v1.pdf","comment":"In the Workshop on High-dimensional Learning Dynamics at ICML 2023"},{"id":"http://arxiv.org/abs/2307.12917v1","updated":"2023-07-24T16:18:22Z","published":"2023-07-24T16:18:22Z","title":"Hierarchical Skeleton Meta-Prototype Contrastive Learning with Hard\n Skeleton Mining for Unsupervised Person Re-Identification","summary":" With rapid advancements in depth sensors and deep learning, skeleton-based\nperson re-identification (re-ID) models have recently achieved remarkable\nprogress with many advantages. Most existing solutions learn single-level\nskeleton features from body joints with the assumption of equal skeleton\nimportance, while they typically lack the ability to exploit more informative\nskeleton features from various levels such as limb level with more global body\npatterns. The label dependency of these methods also limits their flexibility\nin learning more general skeleton representations. This paper proposes a\ngeneric unsupervised Hierarchical skeleton Meta-Prototype Contrastive learning\n(Hi-MPC) approach with Hard Skeleton Mining (HSM) for person re-ID with\nunlabeled 3D skeletons. Firstly, we construct hierarchical representations of\nskeletons to model coarse-to-fine body and motion features from the levels of\nbody joints, components, and limbs. Then a hierarchical meta-prototype\ncontrastive learning model is proposed to cluster and contrast the most typical\nskeleton features (\"prototypes\") from different-level skeletons. By converting\noriginal prototypes into meta-prototypes with multiple homogeneous\ntransformations, we induce the model to learn the inherent consistency of\nprototypes to capture more effective skeleton features for person re-ID.\nFurthermore, we devise a hard skeleton mining mechanism to adaptively infer the\ninformative importance of each skeleton, so as to focus on harder skeletons to\nlearn more discriminative skeleton representations. Extensive evaluations on\nfive datasets demonstrate that our approach outperforms a wide variety of\nstate-of-the-art skeleton-based methods. We further show the general\napplicability of our method to cross-view person re-ID and RGB-based scenarios\nwith estimated skeletons.\n","authors":["Haocong Rao","Cyril Leung","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2307.12917v1.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV). Codes\n are available at https://github.com/Kali-Hac/Hi-MPC. 
Supplemental materials\n will be included in the published version"},{"id":"http://arxiv.org/abs/2307.12914v1","updated":"2023-07-24T16:13:43Z","published":"2023-07-24T16:13:43Z","title":"Towards a Visual-Language Foundation Model for Computational Pathology","summary":" The accelerated adoption of digital pathology and advances in deep learning\nhave enabled the development of powerful models for various pathology tasks\nacross a diverse array of diseases and patient cohorts. However, model training\nis often difficult due to label scarcity in the medical domain and the model's\nusage is limited by the specific task and disease for which it is trained.\nAdditionally, most models in histopathology leverage only image data, a stark\ncontrast to how humans teach each other and reason about histopathologic\nentities. We introduce CONtrastive learning from Captions for Histopathology\n(CONCH), a visual-language foundation model developed using diverse sources of\nhistopathology images, biomedical text, and notably over 1.17 million\nimage-caption pairs via task-agnostic pretraining. Evaluated on a suite of 13\ndiverse benchmarks, CONCH can be transferred to a wide range of downstream\ntasks involving either or both histopathology images and text, achieving\nstate-of-the-art performance on histology image classification, segmentation,\ncaptioning, text-to-image and image-to-text retrieval. CONCH represents a\nsubstantial leap over concurrent visual-language pretrained systems for\nhistopathology, with the potential to directly facilitate a wide array of\nmachine learning-based workflows requiring minimal or no further supervised\nfine-tuning.\n","authors":["Ming Y. Lu","Bowen Chen","Drew F. K. Williamson","Richard J. Chen","Ivy Liang","Tong Ding","Guillaume Jaume","Igor Odintsov","Andrew Zhang","Long Phi Le","Georg Gerber","Anil V Parwani","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2307.12914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12909v1","updated":"2023-07-24T16:08:32Z","published":"2023-07-24T16:08:32Z","title":"Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields","summary":" Recently, the editing of neural radiance fields (NeRFs) has gained\nconsiderable attention, but most prior works focus on static scenes while\nresearch on the appearance editing of dynamic scenes is relatively lacking. In\nthis paper, we propose a novel framework to edit the local appearance of\ndynamic NeRFs by manipulating pixels in a single frame of training video.\nSpecifically, to locally edit the appearance of dynamic NeRFs while preserving\nunedited regions, we introduce a local surface representation of the edited\nregion, which can be inserted into and rendered along with the original NeRF\nand warped to arbitrary other frames through a learned invertible motion\nrepresentation network. By employing our method, users without professional\nexpertise can easily add desired content to the appearance of a dynamic scene.\nWe extensively evaluate our approach on various scenes and show that our\napproach achieves spatially and temporally consistent editing results. 
Notably,\nour approach is versatile and applicable to different variants of dynamic NeRF\nrepresentations.\n","authors":["Shangzhan Zhang","Sida Peng","Yinji ShenTu","Qing Shuai","Tianrun Chen","Kaicheng Yu","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12909v1.pdf","comment":"project page: https://dyn-e.github.io/"},{"id":"http://arxiv.org/abs/2307.12907v1","updated":"2023-07-24T16:02:42Z","published":"2023-07-24T16:02:42Z","title":"GridMM: Grid Memory Map for Vision-and-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nTo represent the previously visited environment, most approaches for VLN\nimplement memory using recurrent states, topological maps, or top-down semantic\nmaps. In contrast to these approaches, we build the top-down egocentric and\ndynamically growing Grid Memory Map (i.e., GridMM) to structure the visited\nenvironment. From a global perspective, historical observations are projected\ninto a unified grid map in a top-down view, which can better represent the\nspatial relations of the environment. From a local perspective, we further\npropose an instruction relevance aggregation method to capture fine-grained\nvisual clues in each grid region. Extensive experiments are conducted on both\nthe REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE\ndataset in the continuous environments, showing the superiority of our proposed\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.12907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12900v1","updated":"2023-07-24T15:47:21Z","published":"2023-07-24T15:47:21Z","title":"Automotive Object Detection via Learning Sparse Events by Temporal\n Dynamics of Spiking Neurons","summary":" Event-based sensors, with their high temporal resolution (1us) and dynamical\nrange (120dB), have the potential to be deployed in high-speed platforms such\nas vehicles and drones. However, the highly sparse and fluctuating nature of\nevents poses challenges for conventional object detection techniques based on\nArtificial Neural Networks (ANNs). In contrast, Spiking Neural Networks (SNNs)\nare well-suited for representing event-based data due to their inherent\ntemporal dynamics. In particular, we demonstrate that the membrane potential\ndynamics can modulate network activity upon fluctuating events and strengthen\nfeatures of sparse input. In addition, the spike-triggered adaptive threshold\ncan stabilize training which further improves network performance. Based on\nthis, we develop an efficient spiking feature pyramid network for event-based\nobject detection. Our proposed SNN outperforms previous SNNs and sophisticated\nANNs with attention mechanisms, achieving a mean average precision (map50) of\n47.7% on the Gen1 benchmark dataset. This result significantly surpasses the\nprevious best SNN by 9.7% and demonstrates the potential of SNNs for\nevent-based vision. Our model has a concise architecture while maintaining high\naccuracy and much lower computation cost as a result of sparse computation. 
Our\ncode will be publicly available.\n","authors":["Hu Zhang","Luziwei Leng","Kaiwei Che","Qian Liu","Jie Cheng","Qinghai Guo","Jiangxing Liao","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.12900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12803v3","updated":"2023-07-24T15:27:16Z","published":"2022-01-30T12:53:51Z","title":"Generalizing similarity in noisy setups: the DIBS phenomenon","summary":" This work uncovers an interplay among data density, noise, and the\ngeneralization ability in similarity learning. We consider Siamese Neural\nNetworks (SNNs), which are the basic form of contrastive learning, and explore\ntwo types of noise that can impact SNNs, Pair Label Noise (PLN) and Single\nLabel Noise (SLN). Our investigation reveals that SNNs exhibit double descent\nbehaviour regardless of the training setup and that it is further exacerbated\nby noise. We demonstrate that the density of data pairs is crucial for\ngeneralization. When SNNs are trained on sparse datasets with the same amount\nof PLN or SLN, they exhibit comparable generalization properties. However, when\nusing dense datasets, PLN cases generalize worse than SLN ones in the\noverparametrized region, leading to a phenomenon we call Density-Induced Break\nof Similarity (DIBS). In this regime, PLN similarity violation becomes\nmacroscopical, corrupting the dataset to the point where complete interpolation\ncannot be achieved, regardless of the number of model parameters. Our analysis\nalso delves into the correspondence between online optimization and offline\ngeneralization in similarity learning. The results show that this equivalence\nfails in the presence of label noise in all the scenarios considered.\n","authors":["Nayara Fonseca","Veronica Guidetti"],"pdf_url":"https://arxiv.org/pdf/2201.12803v3.pdf","comment":"v3: version accepted at ECAI 2023 + Supplementary Material"},{"id":"http://arxiv.org/abs/2307.12872v1","updated":"2023-07-24T15:10:22Z","published":"2023-07-24T15:10:22Z","title":"Data-free Black-box Attack based on Diffusion Model","summary":" Since the training data for the target model in a data-free black-box attack\nis not available, most recent schemes utilize GANs to generate data for\ntraining substitute model. However, these GANs-based schemes suffer from low\ntraining efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a data-free black-box attack scheme based\non diffusion model to improve the efficiency and accuracy of substitute\ntraining. Despite the data generated by the diffusion model exhibits high\nquality, it presents diverse domain distributions and contains many samples\nthat do not meet the discriminative criteria of the target model. To further\nfacilitate the diffusion model to generate data suitable for the target model,\nwe propose a Latent Code Augmentation (LCA) method to guide the diffusion model\nin generating data. With the guidance of LCA, the data generated by the\ndiffusion model not only meets the discriminative criteria of the target model\nbut also exhibits high diversity. 
By utilizing this data, it is possible to\ntrain substitute model that closely resemble the target model more efficiently.\nExtensive experiments demonstrate that our LCA achieves higher attack success\nrates and requires fewer query budgets compared to GANs-based schemes for\ndifferent target models.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12868v1","updated":"2023-07-24T15:06:42Z","published":"2023-07-24T15:06:42Z","title":"Understanding the Latent Space of Diffusion Models through the Lens of\n Riemannian Geometry","summary":" Despite the success of diffusion models (DMs), we still lack a thorough\nunderstanding of their latent space. To understand the latent space\n$\\mathbf{x}_t \\in \\mathcal{X}$, we analyze them from a geometrical perspective.\nSpecifically, we utilize the pullback metric to find the local latent basis in\n$\\mathcal{X}$ and their corresponding local tangent basis in $\\mathcal{H}$, the\nintermediate feature maps of DMs. The discovered latent basis enables\nunsupervised image editing capability through latent space traversal. We\ninvestigate the discovered structure from two perspectives. First, we examine\nhow geometric structure evolves over diffusion timesteps. Through analysis, we\nshow that 1) the model focuses on low-frequency components early in the\ngenerative process and attunes to high-frequency details later; 2) At early\ntimesteps, different samples share similar tangent spaces; and 3) The simpler\ndatasets that DMs trained on, the more consistent the tangent space for each\ntimestep. Second, we investigate how the geometric structure changes based on\ntext conditioning in Stable Diffusion. The results show that 1) similar prompts\nyield comparable tangent spaces; and 2) the model depends less on text\nconditions in later timesteps. To the best of our knowledge, this paper is the\nfirst to present image editing through $\\mathbf{x}$-space traversal and provide\nthorough analyses of the latent structure of DMs.\n","authors":["Yong-Hyun Park","Mingi Kwon","Jaewoong Choi","Junghyo Jo","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2307.12868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09224v2","updated":"2023-07-24T15:05:55Z","published":"2023-06-15T16:03:01Z","title":"Encyclopedic VQA: Visual questions about detailed properties of\n fine-grained categories","summary":" We propose Encyclopedic-VQA, a large scale visual question answering (VQA)\ndataset featuring visual questions about detailed properties of fine-grained\ncategories and instances. It contains 221k unique question+answer pairs each\nmatched with (up to) 5 images, resulting in a total of 1M VQA samples.\nMoreover, our dataset comes with a controlled knowledge base derived from\nWikipedia, marking the evidence to support each answer. Empirically, we show\nthat our dataset poses a hard challenge for large vision+language models as\nthey perform poorly on our dataset: PaLI [14] is state-of-the-art on OK-VQA\n[37], yet it only achieves 13.0% accuracy on our dataset. Moreover, we\nexperimentally show that progress on answering our encyclopedic questions can\nbe achieved by augmenting large models with a mechanism that retrieves relevant\ninformation from the knowledge base. 
An oracle experiment with perfect\nretrieval achieves 87.0% accuracy on the single-hop portion of our dataset, and\nan automatic retrieval-augmented prototype yields 48.8%. We believe that our\ndataset enables future research on retrieval-augmented vision+language models.\nIt is available at\nhttps://github.com/google-research/google-research/tree/master/encyclopedic_vqa .\n","authors":["Thomas Mensink","Jasper Uijlings","Lluis Castrejon","Arushi Goel","Felipe Cadar","Howard Zhou","Fei Sha","André Araujo","Vittorio Ferrari"],"pdf_url":"https://arxiv.org/pdf/2306.09224v2.pdf","comment":"ICCV'23"},{"id":"http://arxiv.org/abs/2307.12858v1","updated":"2023-07-24T14:57:40Z","published":"2023-07-24T14:57:40Z","title":"Treatment Outcome Prediction for Intracerebral Hemorrhage via Generative\n Prognostic Model with Imaging and Tabular Data","summary":" Intracerebral hemorrhage (ICH) is the second most common and deadliest form\nof stroke. Despite medical advances, predicting treatment outcomes for ICH\nremains a challenge. This paper proposes a novel prognostic model that utilizes\nboth imaging and tabular data to predict treatment outcome for ICH. Our model\nis trained on observational data collected from non-randomized controlled\ntrials, providing reliable predictions of treatment success. Specifically, we\npropose to employ a variational autoencoder model to generate a low-dimensional\nprognostic score, which can effectively address the selection bias resulting\nfrom the non-randomized controlled trials. Importantly, we develop a\nvariational distributions combination module that combines the information from\nimaging data, non-imaging clinical data, and treatment assignment to accurately\ngenerate the prognostic score. We conducted extensive experiments on a\nreal-world clinical dataset of intracerebral hemorrhage. Our proposed method\ndemonstrates a substantial improvement in treatment outcome prediction compared\nto existing state-of-the-art approaches. Code is available at\nhttps://github.com/med-air/TOP-GPM\n","authors":["Wenao Ma","Cheng Chen","Jill Abrigo","Calvin Hoi-Kwan Mak","Yuqi Gong","Nga Yan Chan","Chu Han","Zaiyi Liu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12854v1","updated":"2023-07-24T14:55:15Z","published":"2023-07-24T14:55:15Z","title":"Multiscale Video Pretraining for Long-Term Activity Forecasting","summary":" Long-term activity forecasting is an especially challenging research problem\nbecause it requires understanding the temporal relationships between observed\nactions, as well as the variability and complexity of human activities. Despite\nrelying on strong supervision via expensive human annotations, state-of-the-art\nforecasting approaches often generalize poorly to unseen data. To alleviate\nthis issue, we propose Multiscale Video Pretraining (MVP), a novel\nself-supervised pretraining approach that learns robust representations for\nforecasting by learning to predict contextualized representations of future\nvideo clips over multiple timescales. MVP is based on our observation that\nactions in videos have a multiscale nature, where atomic actions typically\noccur at a short timescale and more complex actions may span longer timescales.\nWe compare MVP to state-of-the-art self-supervised video learning approaches on\ndownstream long-term forecasting tasks including long-term action anticipation\nand video summary prediction. 
Our comprehensive experiments across the Ego4D\nand Epic-Kitchens-55/100 datasets demonstrate that MVP out-performs\nstate-of-the-art methods by significant margins. Notably, MVP obtains a\nrelative performance gain of over 20% accuracy in video summary forecasting\nover existing methods.\n","authors":["Reuben Tan","Matthias De Lange","Michael Iuzzolino","Bryan A. Plummer","Kate Saenko","Karl Ridgeway","Lorenzo Torresani"],"pdf_url":"https://arxiv.org/pdf/2307.12854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11630v3","updated":"2023-07-24T14:53:51Z","published":"2023-03-21T06:54:18Z","title":"BoxSnake: Polygonal Instance Segmentation with Box Supervision","summary":" Box-supervised instance segmentation has gained much attention as it requires\nonly simple box annotations instead of costly mask or polygon annotations.\nHowever, existing box-supervised instance segmentation models mainly focus on\nmask-based frameworks. We propose a new end-to-end training technique, termed\nBoxSnake, to achieve effective polygonal instance segmentation using only box\nannotations for the first time. Our method consists of two loss functions: (1)\na point-based unary loss that constrains the bounding box of predicted polygons\nto achieve coarse-grained segmentation; and (2) a distance-aware pairwise loss\nthat encourages the predicted polygons to fit the object boundaries. Compared\nwith the mask-based weakly-supervised methods, BoxSnake further reduces the\nperformance gap between the predicted segmentation and the bounding box, and\nshows significant superiority on the Cityscapes dataset. The code has been\navailable publicly.\n","authors":["Rui Yang","Lin Song","Yixiao Ge","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2303.11630v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12853v1","updated":"2023-07-24T14:53:23Z","published":"2023-07-24T14:53:23Z","title":"Spatiotemporal Modeling Encounters 3D Medical Image Analysis:\n Slice-Shift UNet with Multi-View Fusion","summary":" As a fundamental part of computational healthcare, Computer Tomography (CT)\nand Magnetic Resonance Imaging (MRI) provide volumetric data, making the\ndevelopment of algorithms for 3D image analysis a necessity. Despite being\ncomputationally cheap, 2D Convolutional Neural Networks can only extract\nspatial information. In contrast, 3D CNNs can extract three-dimensional\nfeatures, but they have higher computational costs and latency, which is a\nlimitation for clinical practice that requires fast and efficient models.\nInspired by the field of video action recognition we propose a new 2D-based\nmodel dubbed Slice SHift UNet (SSH-UNet) which encodes three-dimensional\nfeatures at 2D CNN's complexity. More precisely multi-view features are\ncollaboratively learned by performing 2D convolutions along the three\northogonal planes of a volume and imposing a weights-sharing mechanism. The\nthird dimension, which is neglected by the 2D convolution, is reincorporated by\nshifting a portion of the feature maps along the slices' axis. The\neffectiveness of our approach is validated in Multi-Modality Abdominal\nMulti-Organ Segmentation (AMOS) and Multi-Atlas Labeling Beyond the Cranial\nVault (BTCV) datasets, showing that SSH-UNet is more efficient while on par in\nperformance with state-of-the-art architectures.\n","authors":["C. I. Ugwu","S. Casarin","O. 
Lanz"],"pdf_url":"https://arxiv.org/pdf/2307.12853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12845v1","updated":"2023-07-24T14:43:07Z","published":"2023-07-24T14:43:07Z","title":"Multi-View Vertebra Localization and Identification from CT Images","summary":" Accurately localizing and identifying vertebrae from CT images is crucial for\nvarious clinical applications. However, most existing efforts are performed on\n3D with cropping patch operation, suffering from the large computation costs\nand limited global information. In this paper, we propose a multi-view vertebra\nlocalization and identification from CT images, converting the 3D problem into\na 2D localization and identification task on different views. Without the\nlimitation of the 3D cropped patch, our method can learn the multi-view global\ninformation naturally. Moreover, to better capture the anatomical structure\ninformation from different view perspectives, a multi-view contrastive learning\nstrategy is developed to pre-train the backbone. Additionally, we further\npropose a Sequence Loss to maintain the sequential structure embedded along the\nvertebrae. Evaluation results demonstrate that, with only two 2D networks, our\nmethod can localize and identify vertebrae in CT images accurately, and\noutperforms the state-of-the-art methods consistently. Our code is available at\nhttps://github.com/ShanghaiTech-IMPACT/Multi-View-Vertebra-Localization-and-Identification-from-CT-Images.\n","authors":["Han Wu","Jiadong Zhang","Yu Fang","Zhentao Liu","Nizhuan Wang","Zhiming Cui","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2307.12845v1.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2306.15599v2","updated":"2023-07-24T14:41:40Z","published":"2023-06-27T16:37:37Z","title":"Coupling a Recurrent Neural Network to SPAD TCSPC Systems for Real-time\n Fluorescence Lifetime Imaging","summary":" Fluorescence lifetime imaging (FLI) has been receiving increased attention in\nrecent years as a powerful diagnostic technique in biological and medical\nresearch. However, existing FLI systems often suffer from a tradeoff between\nprocessing speed, accuracy, and robustness. In this paper, we propose a robust\napproach that enables fast FLI with no degradation of accuracy. The approach is\nbased on a SPAD TCSPC system coupled to a recurrent neural network (RNN) that\naccurately estimates the fluorescence lifetime directly from raw timestamps\nwithout building histograms, thereby drastically reducing transfer data volumes\nand hardware resource utilization, thus enabling FLI acquisition at video rate.\nWe train two variants of the RNN on a synthetic dataset and compare the results\nto those obtained using center-of-mass method (CMM) and least squares fitting\n(LS fitting). Results demonstrate that two RNN variants, gated recurrent unit\n(GRU) and long short-term memory (LSTM), are comparable to CMM and LS fitting\nin terms of accuracy, while outperforming them in background noise by a large\nmargin. To explore the ultimate limits of the approach, we derived the\nCramer-Rao lower bound of the measurement, showing that RNN yields lifetime\nestimations with near-optimal precision. Moreover, our FLI model, which is\npurely trained on synthetic datasets, works well with never-seen-before,\nreal-world data. To demonstrate real-time operation, we have built a FLI\nmicroscope based on Piccolo, a 32x32 SPAD sensor developed in our lab. 
Four\nquantized GRU cores, capable of processing up to 4 million photons per second,\nare deployed on a Xilinx Kintex-7 FPGA. Powered by the GRU, the FLI setup can\nretrieve real-time fluorescence lifetime images at up to 10 frames per second.\nThe proposed FLI system is promising and ideally suited for biomedical\napplications.\n","authors":["Yang Lin","Paul Mos","Andrei Ardelean","Claudio Bruschini","Edoardo Charbon"],"pdf_url":"https://arxiv.org/pdf/2306.15599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09696v2","updated":"2023-07-24T14:36:24Z","published":"2023-07-19T00:41:39Z","title":"Towards Saner Deep Image Registration","summary":" With recent advances in computing hardware and surges of deep-learning\narchitectures, learning-based deep image registration methods have surpassed\ntheir traditional counterparts, in terms of metric performance and inference\ntime. However, these methods focus on improving performance measurements such\nas Dice, resulting in less attention given to model behaviors that are equally\ndesirable for registrations, especially for medical imaging. This paper\ninvestigates these behaviors for popular learning-based deep registrations\nunder a sanity-checking microscope. We find that most existing registrations\nsuffer from low inverse consistency and nondiscrimination of identical pairs\ndue to overly optimized image similarities. To rectify these behaviors, we\npropose a novel regularization-based sanity-enforcer method that imposes two\nsanity checks on the deep model to reduce its inverse consistency errors and\nincrease its discriminative power simultaneously. Moreover, we derive a set of\ntheoretical guarantees for our sanity-checked image registration method, with\nexperimental results supporting our theoretical findings and their\neffectiveness in increasing the sanity of models without sacrificing any\nperformance. Our code and models are available at\nhttps://github.com/tuffr5/Saner-deep-registration.\n","authors":["Bin Duan","Ming Zhong","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2307.09696v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12837v1","updated":"2023-07-24T14:35:46Z","published":"2023-07-24T14:35:46Z","title":"EPIC-KITCHENS-100 Unsupervised Domain Adaptation Challenge: Mixed\n Sequences Prediction","summary":" This report presents the technical details of our approach for the\nEPIC-Kitchens-100 Unsupervised Domain Adaptation (UDA) Challenge in Action\nRecognition. Our approach is based on the idea that the order in which actions\nare performed is similar between the source and target domains. Based on this,\nwe generate a modified sequence by randomly combining actions from the source\nand target domains. As only unlabelled target data are available under the UDA\nsetting, we use a standard pseudo-labeling strategy for extracting action\nlabels for the target. We then ask the network to predict the resulting action\nsequence. This allows to integrate information from both domains during\ntraining and to achieve better transfer results on target. Additionally, to\nbetter incorporate sequence information, we use a language model to filter\nunlikely sequences. Lastly, we employed a co-occurrence matrix to eliminate\nunseen combinations of verbs and nouns. 
Our submission, labeled as 'sshayan',\ncan be found on the leaderboard, where it currently holds the 2nd position for\n'verb' and the 4th position for both 'noun' and 'action'.\n","authors":["Amirshayan Nasirimajd","Simone Alberto Peirone","Chiara Plizzari","Barbara Caputo"],"pdf_url":"https://arxiv.org/pdf/2307.12837v1.pdf","comment":"2nd place in the 2023 EPIC-KITCHENS-100 Unsupervised Domain\n Adaptation Challenge for Action Recognition"},{"id":"http://arxiv.org/abs/2307.12822v1","updated":"2023-07-24T14:19:36Z","published":"2023-07-24T14:19:36Z","title":"Learning Provably Robust Estimators for Inverse Problems via Jittering","summary":" Deep neural networks provide excellent performance for inverse problems such\nas denoising. However, neural networks can be sensitive to adversarial or\nworst-case perturbations. This raises the question of whether such networks can\nbe trained efficiently to be worst-case robust. In this paper, we investigate\nwhether jittering, a simple regularization technique that adds isotropic\nGaussian noise during training, is effective for learning worst-case robust\nestimators for inverse problems. While well studied for prediction in\nclassification tasks, the effectiveness of jittering for inverse problems has\nnot been systematically investigated. In this paper, we present a novel\nanalytical characterization of the optimal $\\ell_2$-worst-case robust estimator\nfor linear denoising and show that jittering yields optimal robust denoisers.\nFurthermore, we examine jittering empirically via training deep neural networks\n(U-nets) for natural image denoising, deconvolution, and accelerated magnetic\nresonance imaging (MRI). The results show that jittering significantly enhances\nthe worst-case robustness, but can be suboptimal for inverse problems beyond\ndenoising. Moreover, our results imply that training on real data which often\ncontains slight noise is somewhat robustness enhancing.\n","authors":["Anselm Krainovic","Mahdi Soltanolkotabi","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2307.12822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12813v1","updated":"2023-07-24T14:06:54Z","published":"2023-07-24T14:06:54Z","title":"Exposing the Troublemakers in Described Object Detection","summary":" Detecting objects based on language descriptions is a popular task that\nincludes Open-Vocabulary object Detection (OVD) and Referring Expression\nComprehension (REC). In this paper, we advance them to a more practical setting\ncalled Described Object Detection (DOD) by expanding category names to flexible\nlanguage expressions for OVD and overcoming the limitation of REC to only\ngrounding the pre-existing object. We establish the research foundation for DOD\ntasks by constructing a Description Detection Dataset ($D^3$), featuring\nflexible language expressions and annotating all described objects without\nomission. By evaluating previous SOTA methods on $D^3$, we find some\ntroublemakers that fail current REC, OVD, and bi-functional methods. REC\nmethods struggle with confidence scores, rejecting negative instances, and\nmulti-target scenarios, while OVD methods face constraints with long and\ncomplex descriptions. Recent bi-functional methods also do not work well on DOD\ndue to their separated training procedures and inference strategies for REC and\nOVD tasks. 
Building upon the aforementioned findings, we propose a baseline\nthat largely improves REC methods by reconstructing the training data and\nintroducing a binary classification sub-task, outperforming existing methods.\nData and code is available at https://github.com/shikras/d-cube.\n","authors":["Chi Xie","Zhao Zhang","Yixuan Wu","Feng Zhu","Rui Zhao","Shuang Liang"],"pdf_url":"https://arxiv.org/pdf/2307.12813v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2307.02148v2","updated":"2023-07-24T13:59:50Z","published":"2023-07-05T09:44:02Z","title":"Compound Attention and Neighbor Matching Network for Multi-contrast MRI\n Super-resolution","summary":" Multi-contrast magnetic resonance imaging (MRI) reflects information about\nhuman tissue from different perspectives and has many clinical applications. By\nutilizing the complementary information among different modalities,\nmulti-contrast super-resolution (SR) of MRI can achieve better results than\nsingle-image super-resolution. However, existing methods of multi-contrast MRI\nSR have the following shortcomings that may limit their performance: First,\nexisting methods either simply concatenate the reference and degraded features\nor exploit global feature-matching between them, which are unsuitable for\nmulti-contrast MRI SR. Second, although many recent methods employ transformers\nto capture long-range dependencies in the spatial dimension, they neglect that\nself-attention in the channel dimension is also important for low-level vision\ntasks. To address these shortcomings, we proposed a novel network architecture\nwith compound-attention and neighbor matching (CANM-Net) for multi-contrast MRI\nSR: The compound self-attention mechanism effectively captures the dependencies\nin both spatial and channel dimension; the neighborhood-based feature-matching\nmodules are exploited to match degraded features and adjacent reference\nfeatures and then fuse them to obtain the high-quality images. We conduct\nexperiments of SR tasks on the IXI, fastMRI, and real-world scanning datasets.\nThe CANM-Net outperforms state-of-the-art approaches in both retrospective and\nprospective experiments. Moreover, the robustness study in our work shows that\nthe CANM-Net still achieves good performance when the reference and degraded\nimages are imperfectly registered, proving good potential in clinical\napplications.\n","authors":["Wenxuan Chen","Sirui Wu","Shuai Wang","Zhongsen Li","Jia Yang","Huifeng Yao","Xiaomeng Li","Xiaolei Song"],"pdf_url":"https://arxiv.org/pdf/2307.02148v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2211.16761v3","updated":"2023-07-24T13:53:26Z","published":"2022-11-30T05:59:23Z","title":"Improving Cross-Modal Retrieval with Set of Diverse Embeddings","summary":" Cross-modal retrieval across image and text modalities is a challenging task\ndue to its inherent ambiguity: An image often exhibits various situations, and\na caption can be coupled with diverse images. Set-based embedding has been\nstudied as a solution to this problem. It seeks to encode a sample into a set\nof different embedding vectors that capture different semantics of the sample.\nIn this paper, we present a novel set-based embedding method, which is distinct\nfrom previous work in two aspects. 
First, we present a new similarity function\ncalled smooth-Chamfer similarity, which is designed to alleviate the side\neffects of existing similarity functions for set-based embedding. Second, we\npropose a novel set prediction module to produce a set of embedding vectors\nthat effectively captures diverse semantics of input by the slot attention\nmechanism. Our method is evaluated on the COCO and Flickr30K datasets across\ndifferent visual backbones, where it outperforms existing methods including\nones that demand substantially larger computation at inference.\n","authors":["Dongwon Kim","Namyup Kim","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2211.16761v3.pdf","comment":"Accepted to CVPR 2023 (Highlight)"},{"id":"http://arxiv.org/abs/2307.12790v1","updated":"2023-07-24T13:39:21Z","published":"2023-07-24T13:39:21Z","title":"Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution\n for Medical Image Classification","summary":" Graph-based neural network models are gaining traction in the field of\nrepresentation learning due to their ability to uncover latent topological\nrelationships between entities that are otherwise challenging to identify.\nThese models have been employed across a diverse range of domains, encompassing\ndrug discovery, protein interactions, semantic segmentation, and fluid dynamics\nresearch. In this study, we investigate the potential of Graph Neural Networks\n(GNNs) for medical image classification. We introduce a novel model that\ncombines GNNs and edge convolution, leveraging the interconnectedness of RGB\nchannel feature values to strongly represent connections between crucial graph\nnodes. Our proposed model not only performs on par with state-of-the-art Deep\nNeural Networks (DNNs) but does so with 1000 times fewer parameters, resulting\nin reduced training time and data requirements. We compare our Graph\nConvolutional Neural Network (GCNN) to pre-trained DNNs for classifying\nMedMNIST dataset classes, revealing promising prospects for GNNs in medical\nimage analysis. Our results also encourage further exploration of advanced\ngraph-based models such as Graph Attention Networks (GAT) and Graph\nAuto-Encoders in the medical imaging domain. The proposed model yields more\nreliable, interpretable, and accurate outcomes for tasks like semantic\nsegmentation and image classification compared to simpler GCNNs\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2307.12790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13170v4","updated":"2023-07-24T13:35:28Z","published":"2022-04-27T20:04:24Z","title":"AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias\n Estimation","summary":" In Federated Learning (FL), a number of clients or devices collaborate to\ntrain a model without sharing their data. Models are optimized locally at each\nclient and further communicated to a central hub for aggregation. While FL is\nan appealing decentralized training paradigm, heterogeneity among data from\ndifferent clients can cause the local optimization to drift away from the\nglobal objective. In order to estimate and therefore remove this drift,\nvariance reduction techniques have been incorporated into FL optimization\nrecently. However, these approaches inaccurately estimate the clients' drift\nand ultimately fail to remove it properly. In this work, we propose an adaptive\nalgorithm that accurately estimates drift across clients. 
In comparison to\nprevious works, our approach necessitates less storage and communication\nbandwidth, as well as lower compute costs. Additionally, our proposed\nmethodology induces stability by constraining the norm of estimates for client\ndrift, making it more practical for large scale FL. Experimental findings\ndemonstrate that the proposed algorithm converges significantly faster and\nachieves higher accuracy than the baselines across various FL benchmarks.\n","authors":["Farshid Varno","Marzie Saghayi","Laya Rafiee Sevyeri","Sharut Gupta","Stan Matwin","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2204.13170v4.pdf","comment":"Published as a conference paper at ECCV 2022; Corrected some typos in\n the text and a baseline algorithm"},{"id":"http://arxiv.org/abs/2303.12540v2","updated":"2023-07-24T13:35:16Z","published":"2023-03-22T13:16:37Z","title":"Deployment of Image Analysis Algorithms under Prevalence Shifts","summary":" Domain gaps are among the most relevant roadblocks in the clinical\ntranslation of machine learning (ML)-based solutions for medical image\nanalysis. While current research focuses on new training paradigms and network\narchitectures, little attention is given to the specific effect of prevalence\nshifts on an algorithm deployed in practice. Such discrepancies between class\nfrequencies in the data used for a method's development/validation and that in\nits deployment environment(s) are of great importance, for example in the\ncontext of artificial intelligence (AI) democratization, as disease prevalences\nmay vary widely across time and location. Our contribution is twofold. First,\nwe empirically demonstrate the potentially severe consequences of missing\nprevalence handling by analyzing (i) the extent of miscalibration, (ii) the\ndeviation of the decision threshold from the optimum, and (iii) the ability of\nvalidation metrics to reflect neural network performance on the deployment\npopulation as a function of the discrepancy between development and deployment\nprevalence. Second, we propose a workflow for prevalence-aware image\nclassification that uses estimated deployment prevalences to adjust a trained\nclassifier to a new environment, without requiring additional annotated\ndeployment data. Comprehensive experiments based on a diverse set of 30 medical\nclassification tasks showcase the benefit of the proposed workflow in\ngenerating better classifier decisions and more reliable performance estimates\ncompared to current practice.\n","authors":["Patrick Godau","Piotr Kalinowski","Evangelia Christodoulou","Annika Reinke","Minu Tizabi","Luciana Ferrer","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.12540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12775v1","updated":"2023-07-24T13:24:56Z","published":"2023-07-24T13:24:56Z","title":"Is attention all you need in medical image analysis? A review","summary":" Medical imaging is a key component in clinical diagnosis, treatment planning\nand clinical trial design, accounting for almost 90% of all healthcare data.\nCNNs achieved performance gains in medical image analysis (MIA) over the last\nyears. CNNs can efficiently model local pixel interactions and be trained on\nsmall-scale MI data. The main disadvantage of typical CNN models is that they\nignore global pixel relationships within images, which limits their\ngeneralisation ability to understand out-of-distribution data with different\n'global' information. 
The recent progress of Artificial Intelligence gave rise\nto Transformers, which can learn global relationships from data. However, full\nTransformer models need to be trained on large-scale data and involve\ntremendous computational complexity. Attention and Transformer compartments\n(Transf/Attention) which can well maintain properties for modelling global\nrelationships, have been proposed as lighter alternatives of full Transformers.\nRecently, there is an increasing trend to co-pollinate complementary\nlocal-global properties from CNN and Transf/Attention architectures, which led\nto a new era of hybrid models. The past years have witnessed substantial growth\nin hybrid CNN-Transf/Attention models across diverse MIA problems. In this\nsystematic review, we survey existing hybrid CNN-Transf/Attention models,\nreview and unravel key architectural designs, analyse breakthroughs, and\nevaluate current and future opportunities as well as challenges. We also\nintroduced a comprehensive analysis framework on generalisation opportunities\nof scientific and clinical impact, based on which new data-driven domain\ngeneralisation and adaptation methods can be stimulated.\n","authors":["Giorgos Papanastasiou","Nikolaos Dikaios","Jiahao Huang","Chengjia Wang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12774v1","updated":"2023-07-24T13:24:19Z","published":"2023-07-24T13:24:19Z","title":"Fast Full-frame Video Stabilization with Iterative Optimization","summary":" Video stabilization refers to the problem of transforming a shaky video into\na visually pleasing one. The question of how to strike a good trade-off between\nvisual quality and computational speed has remained one of the open challenges\nin video stabilization. Inspired by the analogy between wobbly frames and\njigsaw puzzles, we propose an iterative optimization-based learning approach\nusing synthetic datasets for video stabilization, which consists of two\ninteracting submodules: motion trajectory smoothing and full-frame outpainting.\nFirst, we develop a two-level (coarse-to-fine) stabilizing algorithm based on\nthe probabilistic flow field. The confidence map associated with the estimated\noptical flow is exploited to guide the search for shared regions through\nbackpropagation. Second, we take a divide-and-conquer approach and propose a\nnovel multiframe fusion strategy to render full-frame stabilized views. An\nimportant new insight brought about by our iterative optimization approach is\nthat the target video can be interpreted as the fixed point of nonlinear\nmapping for video stabilization. We formulate video stabilization as a problem\nof minimizing the amount of jerkiness in motion trajectories, which guarantees\nconvergence with the help of fixed-point theory. Extensive experimental results\nare reported to demonstrate the superiority of the proposed approach in terms\nof computational speed and visual quality. The code will be available on\nGitHub.\n","authors":["Weiyue Zhao","Xin Li","Zhan Peng","Xianrui Luo","Xinyi Ye","Hao Lu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12774v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.12761v1","updated":"2023-07-24T13:05:36Z","published":"2023-07-24T13:05:36Z","title":"LiDAR Meta Depth Completion","summary":" Depth estimation is one of the essential tasks to be addressed when creating\nmobile autonomous systems. 
While monocular depth estimation methods have\nimproved in recent times, depth completion provides more accurate and reliable\ndepth maps by additionally using sparse depth information from other sensors\nsuch as LiDAR. However, current methods are specifically trained for a single\nLiDAR sensor. As the scanning pattern differs between sensors, every new sensor\nwould require re-training a specialized depth completion model, which is\ncomputationally inefficient and not flexible. Therefore, we propose to\ndynamically adapt the depth completion model to the used sensor type enabling\nLiDAR adaptive depth completion. Specifically, we propose a meta depth\ncompletion network that uses data patterns derived from the data to learn a\ntask network to alter weights of the main depth completion network to solve a\ngiven depth completion task effectively. The method demonstrates a strong\ncapability to work on multiple LiDAR scanning patterns and can also generalize\nto scanning patterns that are unseen during training. While using a single\nmodel, our method yields significantly better results than a non-adaptive\nbaseline trained on different LiDAR patterns. It outperforms LiDAR-specific\nexpert models for very sparse cases. These advantages allow flexible deployment\nof a single depth completion model on different sensors, which could also prove\nvaluable to process the input of nascent LiDAR technology with adaptive instead\nof fixed scanning patterns.\n","authors":["Wolfgang Boettcher","Lukas Hoyer","Ozan Unal","Dengxin Dai"],"pdf_url":"https://arxiv.org/pdf/2307.12761v1.pdf","comment":"Accepted at IROS 2023"},{"id":"http://arxiv.org/abs/2209.11531v2","updated":"2023-07-24T13:04:48Z","published":"2022-09-23T11:36:32Z","title":"Deep Learning-based Anonymization of Chest Radiographs: A\n Utility-preserving Measure for Patient Privacy","summary":" Robust and reliable anonymization of chest radiographs constitutes an\nessential step before publishing large datasets of such for research purposes.\nThe conventional anonymization process is carried out by obscuring personal\ninformation in the images with black boxes and removing or replacing\nmeta-information. However, such simple measures retain biometric information in\nthe chest radiographs, allowing patients to be re-identified by a linkage\nattack. Therefore, there is an urgent need to obfuscate the biometric\ninformation appearing in the images. We propose the first deep learning-based\napproach (PriCheXy-Net) to targetedly anonymize chest radiographs while\nmaintaining data utility for diagnostic and machine learning purposes. Our\nmodel architecture is a composition of three independent neural networks that,\nwhen collectively used, allow for learning a deformation field that is able to\nimpede patient re-identification. Quantitative results on the ChestX-ray14\ndataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC)\nafter re-training with little impact on the abnormality classification\nperformance. This indicates the ability to preserve underlying abnormality\npatterns while increasing patient privacy. 
Lastly, we compare our proposed\nanonymization approach with two other obfuscation-based methods (Privacy-Net,\nDP-Pix) and demonstrate the superiority of our method towards resolving the\nprivacy-utility trade-off for chest radiographs.\n","authors":["Kai Packhäuser","Sebastian Gündel","Florian Thamm","Felix Denzinger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2209.11531v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.07620v2","updated":"2023-07-24T13:03:17Z","published":"2023-07-14T20:39:07Z","title":"Generalizable Embeddings with Cross-batch Metric Learning","summary":" Global average pooling (GAP) is a popular component in deep metric learning\n(DML) for aggregating features. Its effectiveness is often attributed to\ntreating each feature vector as a distinct semantic entity and GAP as a\ncombination of them. Albeit substantiated, such an explanation's algorithmic\nimplications to learn generalizable entities to represent unseen classes, a\ncrucial DML goal, remain unclear. To address this, we formulate GAP as a convex\ncombination of learnable prototypes. We then show that the prototype learning\ncan be expressed as a recursive process fitting a linear predictor to a batch\nof samples. Building on that perspective, we consider two batches of disjoint\nclasses at each iteration and regularize the learning by expressing the samples\nof a batch with the prototypes that are fitted to the other batch. We validate\nour approach on 4 popular DML benchmarks.\n","authors":["Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2307.07620v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2307.12751v1","updated":"2023-07-24T12:42:45Z","published":"2023-07-24T12:42:45Z","title":"ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised\n Real-world Single Image Super-Resolution","summary":" Single image super-resolution (SISR) is a challenging ill-posed problem that\naims to up-sample a given low-resolution (LR) image to a high-resolution (HR)\ncounterpart. Due to the difficulty in obtaining real LR-HR training pairs,\nrecent approaches are trained on simulated LR images degraded by simplified\ndown-sampling operators, e.g., bicubic. Such an approach can be problematic in\npractice because of the large gap between the synthesized and real-world LR\nimages. To alleviate the issue, we propose a novel Invertible scale-Conditional\nFunction (ICF), which can scale an input image and then restore the original\ninput with different scale conditions. By leveraging the proposed ICF, we\nconstruct a novel self-supervised SISR framework (ICF-SRSR) to handle the\nreal-world SR task without using any paired/unpaired training data.\nFurthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs,\nwhich can make existing supervised SISR networks more robust. Extensive\nexperiments demonstrate the effectiveness of the proposed method in handling\nSISR in a fully self-supervised manner. 
Our ICF-SRSR demonstrates superior\nperformance compared to the existing methods trained on synthetic paired images\nin real-world scenarios and exhibits comparable performance compared to\nstate-of-the-art supervised/unsupervised methods on public benchmark datasets.\n","authors":["Reyhaneh Neshatavar","Mohsen Yavartanoo","Sanghyun Son","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09629v2","updated":"2023-07-24T12:33:09Z","published":"2023-02-19T17:15:56Z","title":"BiofilmScanner: A Computational Intelligence Approach to Obtain\n Bacterial Cell Morphological Attributes from Biofilm Image","summary":" Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for\nsulfate-reducing bacteria (SRB) that are associated with corrosion issues\ncaused by microorganisms. SRB-based biofilms are thought to be responsible for\nthe billion-dollar-per-year bio-corrosion of metal infrastructure.\nUnderstanding the extraction of the bacterial cells' shape and size properties\nin the SRB-biofilm at different growth stages will assist with the design of\nanti-corrosion techniques. However, numerous issues affect current approaches,\nincluding time-consuming geometric property extraction, low efficiency, and\nhigh error rates. This paper proposes BiofilmScanner, a Yolact-based deep\nlearning method integrated with invariant moments to address these problems.\nOur approach efficiently detects and segments bacterial cells in an SRB image\nwhile simultaneously invariant moments measure the geometric characteristics of\nthe segmented cells with low errors. The numerical experiments of the proposed\nmethod demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our\nearlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring\nthe geometric properties of the cell. Furthermore, the BiofilmScanner achieved\nan F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%\nand 75.18%, respectively.\n","authors":["Md Hafizur Rahman","Md Ali Azam","Md Abir Hossen","Shankarachary Ragi","Venkataramana Gadhamshetty"],"pdf_url":"https://arxiv.org/pdf/2302.09629v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2307.12732v1","updated":"2023-07-24T12:24:07Z","published":"2023-07-24T12:24:07Z","title":"CLIP-KD: An Empirical Study of Distilling CLIP Models","summary":" CLIP has become a promising language-supervised visual pre-training framework\nand achieves excellent performance over a wide range of tasks. This paper aims\nto distill small CLIP models supervised by a large teacher CLIP model. We\npropose several distillation strategies, including relation, feature, gradient\nand contrastive paradigm, to examine the impact on CLIP distillation. We show\nthat the simplest feature mimicry with MSE loss performs best. Moreover,\ninteractive contrastive learning and relation-based distillation are also\ncritical in performance improvement. We apply the unified method to distill\nseveral student networks trained on 15 million (image, text) pairs.\nDistillation improves the student CLIP models consistently over zero-shot\nImageNet classification and cross-modal retrieval benchmarks. We hope our\nempirical study will become an important baseline for future CLIP distillation\nresearch. 
The code is available at \\url{https://github.com/winycg/CLIP-KD}.\n","authors":["Chuanguang Yang","Zhulin An","Libo Huang","Junyu Bi","Xinqiang Yu","Han Yang","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12730v1","updated":"2023-07-24T12:22:19Z","published":"2023-07-24T12:22:19Z","title":"COCO-O: A Benchmark for Object Detectors under Natural Distribution\n Shifts","summary":" Practical object detection application can lose its effectiveness on image\ninputs with natural distribution shifts. This problem leads the research\ncommunity to pay more attention on the robustness of detectors under\nOut-Of-Distribution (OOD) inputs. Existing works construct datasets to\nbenchmark the detector's OOD robustness for a specific application scenario,\ne.g., Autonomous Driving. However, these datasets lack universality and are\nhard to benchmark general detectors built on common tasks such as COCO. To give\na more comprehensive robustness assessment, we introduce\nCOCO-O(ut-of-distribution), a test dataset based on COCO with 6 types of\nnatural distribution shifts. COCO-O has a large distribution gap with training\ndata and results in a significant 55.7% relative performance drop on a Faster\nR-CNN detector. We leverage COCO-O to conduct experiments on more than 100\nmodern object detectors to investigate if their improvements are credible or\njust over-fitting to the COCO test set. Unfortunately, most classic detectors\nin early years do not exhibit strong OOD generalization. We further study the\nrobustness effect on recent breakthroughs of detector's architecture design,\naugmentation and pre-training techniques. Some empirical findings are revealed:\n1) Compared with detection head or neck, backbone is the most important part\nfor robustness; 2) An end-to-end detection transformer design brings no\nenhancement, and may even reduce robustness; 3) Large-scale foundation models\nhave made a great leap on robust object detection. We hope our COCO-O could\nprovide a rich testbed for robustness study of object detection. The dataset\nwill be available at\n\\url{https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o}.\n","authors":["Xiaofeng Mao","Yuefeng Chen","Yao Zhu","Da Chen","Hang Su","Rong Zhang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2307.12730v1.pdf","comment":"To appear in ICCV2023,\n https://github.com/alibaba/easyrobust/tree/main/benchmarks/coco_o"},{"id":"http://arxiv.org/abs/2307.12729v1","updated":"2023-07-24T12:21:33Z","published":"2023-07-24T12:21:33Z","title":"Persistent-Transient Duality: A Multi-mechanism Approach for Modeling\n Human-Object Interaction","summary":" Humans are highly adaptable, swiftly switching between different modes to\nprogressively handle different tasks, situations and contexts. In Human-object\ninteraction (HOI) activities, these modes can be attributed to two mechanisms:\n(1) the large-scale consistent plan for the whole activity and (2) the\nsmall-scale children interactive actions that start and end along the timeline.\nWhile neuroscience and cognitive science have confirmed this multi-mechanism\nnature of human behavior, machine modeling approaches for human motion are\ntrailing behind. While attempted to use gradually morphing structures (e.g.,\ngraph attention networks) to model the dynamic HOI patterns, they miss the\nexpeditious and discrete mode-switching nature of the human motion. 
To bridge\nthat gap, this work proposes to model two concurrent mechanisms that jointly\ncontrol human motion: the Persistent process that runs continually on the\nglobal scale, and the Transient sub-processes that operate intermittently on\nthe local context of the human while interacting with objects. These two\nmechanisms form an interactive Persistent-Transient Duality that\nsynergistically governs the activity sequences. We model this conceptual\nduality by a parent-child neural network of Persistent and Transient channels\nwith a dedicated neural module for dynamic mechanism switching. The framework\nis trialed on HOI motion forecasting. On two rich datasets and a wide variety\nof settings, the model consistently delivers superior performances, proving its\nsuitability for the challenge.\n","authors":["Hung Tran","Vuong Le","Svetha Venkatesh","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2307.12729v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.12865v3","updated":"2023-07-24T12:08:50Z","published":"2023-03-22T18:59:48Z","title":"NeRF-GAN Distillation for Efficient 3D-Aware Generation with\n Convolutions","summary":" Pose-conditioned convolutional generative models struggle with high-quality\n3D-consistent image generation from single-view datasets, due to their lack of\nsufficient 3D priors. Recently, the integration of Neural Radiance Fields\n(NeRFs) and generative models, such as Generative Adversarial Networks (GANs),\nhas transformed 3D-aware generation from single-view images. NeRF-GANs exploit\nthe strong inductive bias of neural 3D representations and volumetric rendering\nat the cost of higher computational complexity. This study aims at revisiting\npose-conditioned 2D GANs for efficient 3D-aware generation at inference time by\ndistilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and\neffective method, based on re-using the well-disentangled latent space of a\npre-trained NeRF-GAN in a pose-conditioned convolutional network to directly\ngenerate 3D-consistent images corresponding to the underlying 3D\nrepresentations. Experiments on several datasets demonstrate that the proposed\nmethod obtains results comparable with volumetric rendering in terms of quality\nand 3D consistency while benefiting from the computational advantage of\nconvolutional networks. The code will be available at:\nhttps://github.com/mshahbazi72/NeRF-GAN-Distillation\n","authors":["Mohamad Shahbazi","Evangelos Ntavelis","Alessio Tonioni","Edo Collins","Danda Pani Paudel","Martin Danelljan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12721v1","updated":"2023-07-24T12:03:50Z","published":"2023-07-24T12:03:50Z","title":"AMAE: Adaptation of Pre-Trained Masked Autoencoder for Dual-Distribution\n Anomaly Detection in Chest X-Rays","summary":" Unsupervised anomaly detection in medical images such as chest radiographs is\nstepping into the spotlight as it mitigates the scarcity of the labor-intensive\nand costly expert annotation of anomaly data. However, nearly all existing\nmethods are formulated as a one-class classification trained only on\nrepresentations from the normal class and discard a potentially significant\nportion of the unlabeled data. This paper focuses on a more practical setting,\ndual distribution anomaly detection for chest X-rays, using the entire training\ndata, including both normal and unlabeled images. 
Inspired by a modern\nself-supervised vision transformer model trained using partial image inputs to\nreconstruct missing image regions -- we propose AMAE, a two-stage algorithm for\nadaptation of the pre-trained masked autoencoder (MAE). Starting from MAE\ninitialization, AMAE first creates synthetic anomalies from only normal\ntraining images and trains a lightweight classifier on frozen transformer\nfeatures. Subsequently, we propose an adaptation strategy to leverage unlabeled\nimages containing anomalies. The adaptation scheme is accomplished by assigning\npseudo-labels to unlabeled images and using two separate MAE based modules to\nmodel the normative and anomalous distributions of pseudo-labeled images. The\neffectiveness of the proposed adaptation strategy is evaluated with different\nanomaly ratios in an unlabeled training set. AMAE leads to consistent\nperformance gains over competing self-supervised and dual distribution anomaly\ndetection methods, setting the new state-of-the-art on three public chest X-ray\nbenchmarks: RSNA, NIH-CXR, and VinDr-CXR.\n","authors":["Behzad Bozorgtabar","Dwarikanath Mahapatra","Jean-Philippe Thiran"],"pdf_url":"https://arxiv.org/pdf/2307.12721v1.pdf","comment":"To be presented at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.12718v1","updated":"2023-07-24T11:59:07Z","published":"2023-07-24T11:59:07Z","title":"CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle\n Components","summary":" Neural Radiance Fields (NeRFs) have gained widespread recognition as a highly\neffective technique for representing 3D reconstructions of objects and scenes\nderived from sets of images. Despite their efficiency, NeRF models can pose\nchallenges in certain scenarios such as vehicle inspection, where the lack of\nsufficient data or the presence of challenging elements (e.g. reflections)\nstrongly impact the accuracy of the reconstruction. To this aim, we introduce\nCarPatch, a novel synthetic benchmark of vehicles. In addition to a set of\nimages annotated with their intrinsic and extrinsic camera parameters, the\ncorresponding depth maps and semantic segmentation masks have been generated\nfor each view. Global and part-based metrics have been defined and used to\nevaluate, compare, and better characterize some state-of-the-art techniques.\nThe dataset is publicly released at\nhttps://aimagelab.ing.unimore.it/go/carpatch and can be used as an evaluation\nguide and as a baseline for future work on this challenging topic.\n","authors":["Davide Di Nucci","Alessandro Simoni","Matteo Tomei","Luca Ciuffreda","Roberto Vezzani","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2307.12718v1.pdf","comment":"Accepted at ICIAP2023"},{"id":"http://arxiv.org/abs/2307.12717v1","updated":"2023-07-24T11:58:58Z","published":"2023-07-24T11:58:58Z","title":"Dense Transformer based Enhanced Coding Network for Unsupervised Metal\n Artifact Reduction","summary":" CT images corrupted by metal artifacts have serious negative effects on\nclinical diagnosis. Considering the difficulty of collecting paired data with\nground truth in clinical settings, unsupervised methods for metal artifact\nreduction are of high interest. However, it is difficult for previous\nunsupervised methods to retain structural information from CT images while\nhandling the non-local characteristics of metal artifacts. To address these\nchallenges, we proposed a novel Dense Transformer based Enhanced Coding Network\n(DTEC-Net) for unsupervised metal artifact reduction. 
Specifically, we\nintroduce a Hierarchical Disentangling Encoder, supported by the high-order\ndense process, and transformer to obtain densely encoded sequences with\nlong-range correspondence. Then, we present a second-order disentanglement\nmethod to improve the dense sequence's decoding process. Extensive experiments\nand model discussions illustrate DTEC-Net's effectiveness, which outperforms\nthe previous state-of-the-art methods on a benchmark dataset, and greatly\nreduces metal artifacts while restoring richer texture details.\n","authors":["Wangduo Xie","Matthew B. Blaschko"],"pdf_url":"https://arxiv.org/pdf/2307.12717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09340v3","updated":"2023-07-24T11:34:21Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2011.09094v3","updated":"2023-07-24T11:28:46Z","published":"2020-11-18T05:16:11Z","title":"UP-DETR: Unsupervised Pre-training for Object Detection with\n Transformers","summary":" DEtection TRansformer (DETR) for object detection reaches competitive\nperformance compared with Faster R-CNN via a transformer encoder-decoder\narchitecture. However, trained with scratch transformers, DETR needs\nlarge-scale training data and an extreme long training schedule even on COCO\ndataset. 
Inspired by the great success of pre-training transformers in natural\nlanguage processing, we propose a novel pretext task named random query patch\ndetection in Unsupervised Pre-training DETR (UP-DETR). Specifically, we\nrandomly crop patches from the given image and then feed them as queries to the\ndecoder. The model is pre-trained to detect these query patches from the input\nimage. During the pre-training, we address two critical issues: multi-task\nlearning and multi-query localization. (1) To trade off classification and\nlocalization preferences in the pretext task, we find that freezing the CNN\nbackbone is the prerequisite for the success of pre-training transformers. (2)\nTo perform multi-query localization, we develop UP-DETR with multi-query patch\ndetection with attention mask. Besides, UP-DETR also provides a unified\nperspective for fine-tuning object detection and one-shot detection tasks. In\nour experiments, UP-DETR significantly boosts the performance of DETR with\nfaster convergence and higher average precision on object detection, one-shot\ndetection and panoptic segmentation. Code and pre-training models:\nhttps://github.com/dddzg/up-detr.\n","authors":["Zhigang Dai","Bolun Cai","Yugeng Lin","Junying Chen"],"pdf_url":"https://arxiv.org/pdf/2011.09094v3.pdf","comment":"Accepted by TPAMI 2022 and CVPR 2021"},{"id":"http://arxiv.org/abs/2307.12698v1","updated":"2023-07-24T11:27:14Z","published":"2023-07-24T11:27:14Z","title":"MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised\n Learning of Motion and Content Features","summary":" Self-supervised learning of visual representations has been focusing on\nlearning content features, which do not capture object motion or location, and\nfocus on identifying and differentiating objects in images and videos. On the\nother hand, optical flow estimation is a task that does not involve\nunderstanding the content of the images on which it is estimated. We unify the\ntwo approaches and introduce MC-JEPA, a joint-embedding predictive architecture\nand self-supervised learning approach to jointly learn optical flow and content\nfeatures within a shared encoder, demonstrating that the two associated\nobjectives; the optical flow estimation objective and the self-supervised\nlearning objective; benefit from each other and thus learn content features\nthat incorporate motion information. The proposed approach achieves performance\non-par with existing unsupervised optical flow benchmarks, as well as with\ncommon self-supervised learning approaches on downstream tasks such as semantic\nsegmentation of images and videos.\n","authors":["Adrien Bardes","Jean Ponce","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2307.12698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10763v3","updated":"2023-07-24T11:15:47Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. 
It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2304.02941v2","updated":"2023-07-24T10:57:15Z","published":"2023-04-06T08:56:18Z","title":"Dr. KID: Direct Remeshing and K-set Isometric Decomposition for Scalable\n Physicalization of Organic Shapes","summary":" Dr. KID is an algorithm that uses isometric decomposition for the\nphysicalization of potato-shaped organic models in a puzzle fashion. The\nalgorithm begins with creating a simple, regular triangular surface mesh of\norganic shapes, followed by iterative k-means clustering and remeshing. For\nclustering, we need similarity between triangles (segments) which is defined as\na distance function. The distance function maps each triangle's shape to a\nsingle point in the virtual 3D space. Thus, the distance between the triangles\nindicates their degree of dissimilarity. K-means clustering uses this distance\nand sorts of segments into k classes. After this, remeshing is applied to\nminimize the distance between triangles within the same cluster by making their\nshapes identical. Clustering and remeshing are repeated until the distance\nbetween triangles in the same cluster reaches an acceptable threshold. We adopt\na curvature-aware strategy to determine the surface thickness and finalize\npuzzle pieces for 3D printing. Identical hinges and holes are created for\nassembling the puzzle components. For smoother outcomes, we use triangle\nsubdivision along with curvature-aware clustering, generating curved triangular\npatches for 3D printing. Our algorithm was evaluated using various models, and\nthe 3D-printed results were analyzed. Findings indicate that our algorithm\nperforms reliably on target organic shapes with minimal loss of input geometry.\n","authors":["Dawar Khan","Ciril Bohak","Ivan Viola"],"pdf_url":"https://arxiv.org/pdf/2304.02941v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12676v1","updated":"2023-07-24T10:30:54Z","published":"2023-07-24T10:30:54Z","title":"Damage Vision Mining Opportunity for Imbalanced Anomaly Detection","summary":" In past decade, previous balanced datasets have been used to advance\nalgorithms for classification, object detection, semantic segmentation, and\nanomaly detection in industrial applications. Specifically, for condition-based\nmaintenance, automating visual inspection is crucial to ensure high quality.\nDeterioration prognostic attempts to optimize the fine decision process for\npredictive maintenance and proactive repair. In civil infrastructure and living\nenvironment, damage data mining cannot avoid the imbalanced data issue because\nof rare unseen events and high quality status by improved operations. For\nvisual inspection, deteriorated class acquired from the surface of concrete and\nsteel components are occasionally imbalanced. 
From numerous related surveys, we\nsummarize that imbalanced data problems can be categorized into four types; 1)\nmissing range of target and label valuables, 2) majority-minority class\nimbalance, 3) foreground-background of spatial imbalance, 4) long-tailed class\nof pixel-wise imbalance. Since 2015, there has been many imbalanced studies\nusing deep learning approaches that includes regression, image classification,\nobject detection, semantic segmentation. However, anomaly detection for\nimbalanced data is not yet well known. In the study, we highlight one-class\nanomaly detection application whether anomalous class or not, and demonstrate\nclear examples on imbalanced vision datasets: wooden, concrete deterioration,\nand disaster damage. We provide key results on damage vision mining advantage,\nhypothesizing that the more effective range of positive ratio, the higher\naccuracy gain of anomaly detection application. Finally, the applicability of\nthe damage learning methods, limitations, and future works are mentioned.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v1.pdf","comment":"12 pages, 14 figures, 8 tables"},{"id":"http://arxiv.org/abs/2307.12674v1","updated":"2023-07-24T10:24:13Z","published":"2023-07-24T10:24:13Z","title":"Industrial Segment Anything -- a Case Study in Aircraft Manufacturing,\n Intralogistics, Maintenance, Repair, and Overhaul","summary":" Deploying deep learning-based applications in specialized domains like the\naircraft production industry typically suffers from the training data\navailability problem. Only a few datasets represent non-everyday objects,\nsituations, and tasks. Recent advantages in research around Vision Foundation\nModels (VFM) opened a new area of tasks and models with high generalization\ncapabilities in non-semantic and semantic predictions. As recently demonstrated\nby the Segment Anything Project, exploiting VFM's zero-shot capabilities is a\npromising direction in tackling the boundaries spanned by data, context, and\nsensor variety. Although, investigating its application within specific domains\nis subject to ongoing research. This paper contributes here by surveying\napplications of the SAM in aircraft production-specific use cases. We include\nmanufacturing, intralogistics, as well as maintenance, repair, and overhaul\nprocesses, also representing a variety of other neighboring industrial domains.\nBesides presenting the various use cases, we further discuss the injection of\ndomain knowledge.\n","authors":["Keno Moenck","Arne Wendt","Philipp Prünte","Julian Koch","Arne Sahrhage","Johann Gierecker","Ole Schmedemann","Falko Kähler","Dirk Holst","Martin Gomse","Thorsten Schüppstuhl","Daniel Schoepflin"],"pdf_url":"https://arxiv.org/pdf/2307.12674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12672v1","updated":"2023-07-24T10:20:14Z","published":"2023-07-24T10:20:14Z","title":"Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked\n Image Modeling","summary":" In dynamic Magnetic Resonance Imaging (MRI), k-space is typically\nundersampled due to limited scan time, resulting in aliasing artifacts in the\nimage domain. Hence, dynamic MR reconstruction requires not only modeling\nspatial frequency components in the x and y directions of k-space but also\nconsidering temporal redundancy. Most previous works rely on image-domain\nregularizers (priors) to conduct MR reconstruction. 
In contrast, we focus on\ninterpolating the undersampled k-space before obtaining images with Fourier\ntransform. In this work, we connect masked image modeling with k-space\ninterpolation and propose a novel Transformer-based k-space Global\nInterpolation Network, termed k-GIN. Our k-GIN learns global dependencies among\nlow- and high-frequency components of 2D+t k-space and uses it to interpolate\nunsampled data. Further, we propose a novel k-space Iterative Refinement Module\n(k-IRM) to enhance the high-frequency components learning. We evaluate our\napproach on 92 in-house 2D+t cardiac MR subjects and compare it to MR\nreconstruction methods with image-domain regularizers. Experiments show that\nour proposed k-space interpolation method quantitatively and qualitatively\noutperforms baseline methods. Importantly, the proposed approach achieves\nsubstantially higher robustness and generalizability in cases of\nhighly-undersampled MR data.\n","authors":["Jiazhen Pan","Suprosanna Shit","Özgün Turgut","Wenqi Huang","Hongwei Bran Li","Nil Stolt-Ansó","Thomas Küstner","Kerstin Hammernik","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07250v2","updated":"2023-07-24T10:10:25Z","published":"2023-04-14T16:58:23Z","title":"Fusing Structure from Motion and Simulation-Augmented Pose Regression\n from Optical Flow for Challenging Indoor Environments","summary":" The localization of objects is a crucial task in various applications such as\nrobotics, virtual and augmented reality, and the transportation of goods in\nwarehouses. Recent advances in deep learning have enabled the localization\nusing monocular visual cameras. While structure from motion (SfM) predicts the\nabsolute pose from a point cloud, absolute pose regression (APR) methods learn\na semantic understanding of the environment through neural networks. However,\nboth fields face challenges caused by the environment such as motion blur,\nlighting changes, repetitive patterns, and feature-less structures. This study\naims to address these challenges by incorporating additional information and\nregularizing the absolute pose using relative pose regression (RPR) methods.\nRPR methods suffer under different challenges, i.e., motion blur. The optical\nflow between consecutive images is computed using the Lucas-Kanade algorithm,\nand the relative pose is predicted using an auxiliary small recurrent\nconvolutional network. The fusion of absolute and relative poses is a complex\ntask due to the mismatch between the global and local coordinate systems.\nState-of-the-art methods fusing absolute and relative poses use pose graph\noptimization (PGO) to regularize the absolute pose predictions using relative\nposes. In this work, we propose recurrent fusion networks to optimally align\nabsolute and relative pose predictions to improve the absolute pose prediction.\nWe evaluate eight different recurrent units and construct a simulation\nenvironment to pre-train the APR and RPR networks for better generalized\ntraining. Additionally, we record a large database of different scenarios in a\nchallenging large-scale indoor environment that mimics a warehouse with\ntransportation robots. 
We conduct hyperparameter searches and experiments to\nshow the effectiveness of our recurrent fusion method compared to PGO.\n","authors":["Felix Ott","Lucas Heublein","David Rügamer","Bernd Bischl","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2304.07250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12656v1","updated":"2023-07-24T09:54:49Z","published":"2023-07-24T09:54:49Z","title":"A Theoretically Guaranteed Quaternion Weighted Schatten p-norm\n Minimization Method for Color Image Restoration","summary":" Inspired by the fact that the matrix formulated by nonlocal similar patches\nin a natural image is of low rank, the rank approximation issue has been\nextensively investigated over the past decades, among which weighted nuclear\nnorm minimization (WNNM) and weighted Schatten $p$-norm minimization (WSNM) are\ntwo prevailing methods that have shown great superiority in various image\nrestoration (IR) problems. Due to the physical characteristics of color images,\ncolor image restoration (CIR) is often a much more difficult task than its\ngrayscale image counterpart. However, when applied to CIR, the traditional\nWNNM/WSNM method only processes three color channels individually and fails to\nconsider their cross-channel correlations. Very recently, a quaternion-based\nWNNM approach (QWNNM) has been developed to mitigate this issue, which is\ncapable of representing the color image as a whole in the quaternion domain and\npreserving the inherent correlation among the three color channels. Despite its\nempirical success, unfortunately, the convergence behavior of QWNNM has not\nbeen strictly studied yet. In this paper, on the one hand, we extend WSNM\ninto the quaternion domain and correspondingly propose a novel quaternion-based\nWSNM model (QWSNM) for tackling the CIR problems. Extensive experiments on two\nrepresentative CIR tasks, including color image denoising and deblurring,\ndemonstrate that the proposed QWSNM method performs favorably against many\nstate-of-the-art alternatives, in both quantitative and qualitative\nevaluations. On the other hand, more importantly, we preliminarily provide a\ntheoretical convergence analysis, that is, by modifying the quaternion\nalternating direction method of multipliers (QADMM) through a simple\ncontinuation strategy, we theoretically prove that both the solution sequences\ngenerated by the QWNNM and QWSNM have fixed-point convergence guarantees.\n","authors":["Qing-Hua Zhang","Liang-Tian He","Yi-Lun Wang","Liang-Jian Deng","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12656v1.pdf","comment":"46 pages, 10 figures; references added"},{"id":"http://arxiv.org/abs/2302.01162v5","updated":"2023-07-24T09:41:07Z","published":"2023-02-02T15:37:46Z","title":"Get3DHuman: Lifting StyleGAN-Human into a 3D Generative Model using\n Pixel-aligned Reconstruction Priors","summary":" Fast generation of high-quality 3D digital humans is important to a vast\nnumber of applications ranging from entertainment to professional concerns.\nRecent advances in differentiable rendering have enabled the training of 3D\ngenerative models without requiring 3D ground truths. However, the quality of\nthe generated 3D humans still has much room to improve in terms of both\nfidelity and diversity. In this paper, we present Get3DHuman, a novel 3D human\nframework that can significantly boost the realism and diversity of the\ngenerated outcomes by only using a limited budget of 3D ground-truth data. 
Our\nkey observation is that the 3D generator can profit from human-related priors\nlearned through 2D human generators and 3D reconstructors. Specifically, we\nbridge the latent space of Get3DHuman with that of StyleGAN-Human via a\nspecially-designed prior network, where the input latent code is mapped to the\nshape and texture feature volumes spanned by the pixel-aligned 3D\nreconstructor. The outcomes of the prior network are then leveraged as the\nsupervisory signals for the main generator network. To ensure effective\ntraining, we further propose three tailored losses applied to the generated\nfeature volumes and the intermediate feature maps. Extensive experiments\ndemonstrate that Get3DHuman greatly outperforms the other state-of-the-art\napproaches and can support a wide range of applications including shape\ninterpolation, shape re-texturing, and single-view reconstruction through\nlatent inversion.\n","authors":["Zhangyang Xiong","Di Kang","Derong Jin","Weikai Chen","Linchao Bao","Shuguang Cui","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2302.01162v5.pdf","comment":"ICCV 2023, project page:\n https://x-zhangyang.github.io/2023_Get3DHuman/"},{"id":"http://arxiv.org/abs/2307.12644v1","updated":"2023-07-24T09:35:47Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" Remote Photoplethysmography (rPPG) is a technology that utilizes the light\nabsorption properties of hemoglobin, captured via camera, to analyze and\nmeasure blood volume pulse (BVP). By analyzing the measured BVP, various\nphysiological signals such as heart rate, stress levels, and blood pressure can\nbe derived, enabling applications such as the early prediction of\ncardiovascular diseases. rPPG is a rapidly evolving field as it allows the\nmeasurement of vital signals using camera-equipped devices without the need for\nadditional devices such as blood pressure monitors or pulse oximeters, and\nwithout the assistance of medical experts. Despite extensive efforts and\nadvances in this field, serious challenges remain, including issues related to\nskin color, camera characteristics, ambient lighting, and other sources of\nnoise, which degrade performance accuracy. We argue that fair and evaluable\nbenchmarking is urgently required to overcome these challenges and make any\nmeaningful progress from both academic and commercial perspectives. In most\nexisting work, models are trained, tested, and validated only on limited\ndatasets. Worse still, some studies lack available code or reproducibility,\nmaking it difficult to fairly evaluate and compare performance. Therefore, the\npurpose of this study is to provide a benchmarking framework to evaluate\nvarious rPPG techniques across a wide range of datasets for fair evaluation and\ncomparison, including both conventional non-deep neural network (non-DNN) and\ndeep neural network (DNN) methods. 
GitHub URL:\nhttps://github.com/remotebiosensing/rppg.\n","authors":["Dae Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.03981v2","updated":"2023-07-24T09:24:04Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12637v1","updated":"2023-07-24T09:22:09Z","published":"2023-07-24T09:22:09Z","title":"PG-RCNN: Semantic Surface Point Generation for 3D Object Detection","summary":" One of the main challenges in LiDAR-based 3D object detection is that the\nsensors often fail to capture the complete spatial information about the\nobjects due to long distance and occlusion. Two-stage detectors with point\ncloud completion approaches tackle this problem by adding more points to the\nregions of interest (RoIs) with a pre-trained network. However, these methods\ngenerate dense point clouds of objects for all region proposals, assuming that\nobjects always exist in the RoIs. This leads to the indiscriminate point\ngeneration for incorrect proposals as well. Motivated by this, we propose Point\nGeneration R-CNN (PG-RCNN), a novel end-to-end detector that generates semantic\nsurface points of foreground objects for accurate detection. Our method uses a\njointly trained RoI point generation module to process the contextual\ninformation of RoIs and estimate the complete shape and displacement of\nforeground objects. For every generated point, PG-RCNN assigns a semantic\nfeature that indicates the estimated foreground probability. Extensive\nexperiments show that the point clouds generated by our method provide\ngeometrically and semantically rich information for refining false positive and\nmisaligned proposals. 
PG-RCNN achieves competitive performance on the KITTI\nbenchmark, with significantly fewer parameters than state-of-the-art models.\nThe code is available at https://github.com/quotation2520/PG-RCNN.\n","authors":["Inyong Koo","Inyoung Lee","Se-Ho Kim","Hee-Seon Kim","Woo-jin Jeon","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12637v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11643v2","updated":"2023-07-24T09:18:52Z","published":"2023-07-21T15:22:32Z","title":"Morphological Image Analysis and Feature Extraction for Reasoning with\n AI-based Defect Detection and Classification Models","summary":" As the use of artificial intelligence (AI) models becomes more prevalent in\nindustries such as engineering and manufacturing, it is essential that these\nmodels provide transparent reasoning behind their predictions. This paper\nproposes the AI-Reasoner, which extracts the morphological characteristics of\ndefects (DefChars) from images and utilises decision trees to reason with the\nDefChar values. Thereafter, the AI-Reasoner exports visualisations (i.e.\ncharts) and textual explanations to provide insights into outputs made by\nmask-based defect detection and classification models. It also provides\neffective mitigation strategies to enhance data pre-processing and overall\nmodel performance. The AI-Reasoner was tested on explaining the outputs of an\nIE Mask R-CNN model using a set of 366 images containing defects. The results\ndemonstrated its effectiveness in explaining the IE Mask R-CNN model's\npredictions. Overall, the proposed AI-Reasoner provides a solution for\nimproving the performance of AI models in industrial applications that require\ndefect analysis.\n","authors":["Jiajun Zhang","Georgina Cosma","Sarah Bugby","Axel Finke","Jason Watkins"],"pdf_url":"https://arxiv.org/pdf/2307.11643v2.pdf","comment":"8 pages, 3 figures, 5 tables; submitted to 2023 IEEE symposium series\n on computational intelligence (SSCI)"},{"id":"http://arxiv.org/abs/2307.12634v1","updated":"2023-07-24T09:16:05Z","published":"2023-07-24T09:16:05Z","title":"Automatic lobe segmentation using attentive cross entropy and end-to-end\n fissure generation","summary":" The automatic lung lobe segmentation algorithm is of great significance for\nthe diagnosis and treatment of lung diseases; however, it faces great\nchallenges due to the incompleteness of pulmonary fissures in lung CT images\nand the large variability of pathological features. Therefore, we propose a new\nautomatic lung lobe segmentation framework, in which we urge the model to pay\nattention to the area around the pulmonary fissure during the training process,\nwhich is realized by a task-specific loss function. In addition, we introduce\nan end-to-end pulmonary fissure generation method in the auxiliary pulmonary\nfissure segmentation task, without any additional network branch. Finally, we\npropose a registration-based loss function to alleviate the convergence\ndifficulty of the Dice loss supervised pulmonary fissure segmentation task. 
We\nachieve 97.83% and 94.75% dice scores on our private dataset STLB and public\nLUNA16 dataset respectively.\n","authors":["Qi Su","Na Wang","Jiawen Xie","Yinan Chen","Xiaofan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12634v1.pdf","comment":"5 pages, 3 figures, published to 'IEEE International Symposium on\n Biomedical Imaging (ISBI) 2023'"},{"id":"http://arxiv.org/abs/2307.12630v1","updated":"2023-07-24T09:08:30Z","published":"2023-07-24T09:08:30Z","title":"Semi-Supervised Medical Image Segmentation with Co-Distribution\n Alignment","summary":" Medical image segmentation has made significant progress when a large amount\nof labeled data are available. However, annotating medical image segmentation\ndatasets is expensive due to the requirement of professional skills.\nAdditionally, classes are often unevenly distributed in medical images, which\nseverely affects the classification performance on minority classes. To address\nthese problems, this paper proposes Co-Distribution Alignment (Co-DA) for\nsemi-supervised medical image segmentation. Specifically, Co-DA aligns marginal\npredictions on unlabeled data to marginal predictions on labeled data in a\nclass-wise manner with two differently initialized models before using the\npseudo-labels generated by one model to supervise the other. Besides, we design\nan over-expectation cross-entropy loss for filtering the unlabeled pixels to\nreduce noise in their pseudo-labels. Quantitative and qualitative experiments\non three public datasets demonstrate that the proposed approach outperforms\nexisting state-of-the-art semi-supervised medical image segmentation methods on\nboth the 2D CaDIS dataset and the 3D LGE-MRI and ACDC datasets, achieving an\nmIoU of 0.8515 with only 24% labeled data on CaDIS, and a Dice score of 0.8824\nand 0.8773 with only 20% data on LGE-MRI and ACDC, respectively.\n","authors":["Tao Wang","Zhongzheng Huang","Jiawei Wu","Yuanzheng Cai","Zuoyong Li"],"pdf_url":"https://arxiv.org/pdf/2307.12630v1.pdf","comment":"Paper appears in Bioengineering 2023, 10(7), 869"},{"id":"http://arxiv.org/abs/2307.12622v1","updated":"2023-07-24T08:51:49Z","published":"2023-07-24T08:51:49Z","title":"Phase Match for Out-of-Distribution Generalization","summary":" The Fourier transform, serving as an explicit decomposition method for visual\nsignals, has been employed to explain the out-of-distribution generalization\nbehaviors of Convolutional Neural Networks (CNNs). Previous research and\nempirical studies have indicated that the amplitude spectrum plays a decisive\nrole in CNN recognition, but it is susceptible to disturbance caused by\ndistribution shifts. On the other hand, the phase spectrum preserves\nhighly-structured spatial information, which is crucial for visual\nrepresentation learning. In this paper, we aim to clarify the relationships\nbetween Domain Generalization (DG) and the frequency components by introducing\na Fourier-based structural causal model. Specifically, we interpret the phase\nspectrum as semi-causal factors and the amplitude spectrum as non-causal\nfactors. Building upon these observations, we propose Phase Match (PhaMa) to\naddress DG problems. 
Our method introduces perturbations on the amplitude\nspectrum and establishes spatial relationships to match the phase components.\nThrough experiments on multiple benchmarks, we demonstrate that our proposed\nmethod achieves state-of-the-art performance in domain generalization and\nout-of-distribution robustness tasks.\n","authors":["Chengming Hu","Rui Wang","Hao Chen","Zhouwang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12619v1","updated":"2023-07-24T08:49:20Z","published":"2023-07-24T08:49:20Z","title":"Sparse annotation strategies for segmentation of short axis cardiac MRI","summary":" Short axis cardiac MRI segmentation is a well-researched topic, with\nexcellent results achieved by state-of-the-art models in a supervised setting.\nHowever, annotating MRI volumes is time-consuming and expensive. Many different\napproaches (e.g. transfer learning, data augmentation, few-shot learning, etc.)\nhave emerged in an effort to use fewer annotated data and still achieve similar\nperformance as a fully supervised model. Nevertheless, to the best of our\nknowledge, none of these works focus on which slices of MRI volumes are most\nimportant to annotate for yielding the best segmentation results. In this\npaper, we investigate the effects of training with sparse volumes, i.e.\nreducing the number of cases annotated, and sparse annotations, i.e. reducing\nthe number of slices annotated per case. We evaluate the segmentation\nperformance using the state-of-the-art nnU-Net model on two public datasets to\nidentify which slices are the most important to annotate. We have shown that\ntraining on a significantly reduced dataset (48 annotated volumes) can give a\nDice score greater than 0.85 and results comparable to using the full dataset\n(160 and 240 volumes for each dataset respectively). In general, training on\nmore slice annotations provides more valuable information compared to training\non more volumes. Further, annotating slices from the middle of volumes yields\nthe most beneficial results in terms of segmentation performance, and the\napical region the worst. When evaluating the trade-off between annotating\nvolumes against slices, annotating as many slices as possible instead of\nannotating more volumes is a better strategy.\n","authors":["Josh Stein","Maxime Di Folco","Julia Schnabel"],"pdf_url":"https://arxiv.org/pdf/2307.12619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12618v1","updated":"2023-07-24T08:47:45Z","published":"2023-07-24T08:47:45Z","title":"Attribute Regularized Soft Introspective VAE: Towards Cardiac Attribute\n Regularization Through MRI Domains","summary":" Deep generative models have emerged as influential instruments for data\ngeneration and manipulation. Enhancing the controllability of these models by\nselectively modifying data attributes has been a recent focus. Variational\nAutoencoders (VAEs) have shown promise in capturing hidden attributes but often\nproduce blurry reconstructions. Controlling these attributes through different\nimaging domains is difficult in medical imaging. Recently, Soft Introspective\nVAE leverage the benefits of both VAEs and Generative Adversarial Networks\n(GANs), which have demonstrated impressive image synthesis capabilities, by\nincorporating an adversarial loss into VAE training. In this work, we propose\nthe Attributed Soft Introspective VAE (Attri-SIVAE) by incorporating an\nattribute regularized loss, into the Soft-Intro VAE framework. 
We experimentally evaluate\nthe proposed method on cardiac MRI data from different domains,\nsuch as various scanner vendors and acquisition centers. The proposed method\nachieves similar performance in terms of reconstruction and regularization\ncompared to the state-of-the-art Attributed regularized VAE, but additionally\nsucceeds in keeping the same regularization level when tested on a\ndifferent dataset, unlike the compared method.\n","authors":["Maxime Di Folco","Cosmin Bercea","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2307.12618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12616v1","updated":"2023-07-24T08:44:25Z","published":"2023-07-24T08:44:25Z","title":"CTVIS: Consistent Training for Online Video Instance Segmentation","summary":" The discrimination of instance embeddings plays a vital role in associating\ninstances across time for online video instance segmentation (VIS). Instance\nembedding learning is directly supervised by the contrastive loss computed upon\nthe contrastive items (CIs), which are sets of anchor/positive/negative\nembeddings. Recent online VIS methods leverage CIs sourced from one reference\nframe only, which we argue is insufficient for learning highly discriminative\nembeddings. Intuitively, a possible strategy to enhance CIs is replicating the\ninference phase during training. To this end, we propose a simple yet effective\ntraining strategy, called Consistent Training for Online VIS (CTVIS), which\nis devoted to aligning the training and inference pipelines in terms of building\nCIs. Specifically, CTVIS constructs CIs by referring, as in inference, to the\nmomentum-averaged embeddings and the memory bank storage mechanisms, and by adding\nnoise to the relevant embeddings. Such an extension allows a reliable\ncomparison between embeddings of current instances and the stable\nrepresentations of historical instances, thereby conferring an advantage in\nmodeling VIS challenges such as occlusion, re-identification, and deformation.\nEmpirically, CTVIS outstrips the SOTA VIS models by up to +5.0 points on three\nVIS benchmarks, including YTVIS19 (55.1% AP), YTVIS21 (50.1% AP) and OVIS\n(35.5% AP). Furthermore, we find that pseudo-videos transformed from images can\ntrain robust models surpassing fully-supervised ones.\n","authors":["Kaining Ying","Qing Zhong","Weian Mao","Zhenhua Wang","Hao Chen","Lin Yuanbo Wu","Yifan Liu","Chengxiang Fan","Yunzhi Zhuge","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2307.12616v1.pdf","comment":"Accepted by ICCV 2023. The code is available at\n https://github.com/KainingYing/CTVIS"},{"id":"http://arxiv.org/abs/2307.12612v1","updated":"2023-07-24T08:39:11Z","published":"2023-07-24T08:39:11Z","title":"Less is More: Focus Attention for Efficient DETR","summary":" DETR-like models have significantly boosted the performance of detectors and\neven outperformed classical convolutional models. However, treating all tokens\nequally without discrimination brings a redundant computational burden\nin the traditional encoder structure. Recent sparsification strategies\nexploit a subset of informative tokens to reduce attention complexity while\nmaintaining performance through a sparse encoder. But these methods tend to\nrely on unreliable model statistics. Moreover, simply reducing the token\npopulation hinders the detection performance to a large extent, limiting the\napplication of these sparse models. 
We propose Focus-DETR, which focuses\nattention on more informative tokens for a better trade-off between computation\nefficiency and model accuracy. Specifically, we reconstruct the encoder with\ndual attention, which includes a token scoring mechanism that considers both\nlocalization and category semantic information of the objects from multi-scale\nfeature maps. We efficiently abandon the background queries and enhance the\nsemantic interaction of the fine-grained object queries based on the scores.\nCompared with the state-of-the-art sparse DETR-like detectors under the same\nsetting, our Focus-DETR gets comparable complexity while achieving 50.4AP\n(+2.2) on COCO. The code is available at\nhttps://github.com/huawei-noah/noah-research/tree/master/Focus-DETR and\nhttps://gitee.com/mindspore/models/tree/master/research/cv/Focus-DETR.\n","authors":["Dehua Zheng","Wenhui Dong","Hailin Hu","Xinghao Chen","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12612v1.pdf","comment":"8 pages, 6 figures, accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2307.12607v1","updated":"2023-07-24T08:32:27Z","published":"2023-07-24T08:32:27Z","title":"ExWarp: Extrapolation and Warping-based Temporal Supersampling for\n High-frequency Displays","summary":" High-frequency displays are gaining immense popularity because of their\nincreasing use in video games and virtual reality applications. However, the\nissue is that the underlying GPUs cannot continuously generate frames at this\nhigh rate -- this results in a less smooth and responsive experience.\nFurthermore, if the frame rate is not synchronized with the refresh rate, the\nuser may experience screen tearing and stuttering. Previous works propose\nincreasing the frame rate to provide a smooth experience on modern displays by\npredicting new frames based on past or future frames. Interpolation and\nextrapolation are two widely used algorithms that predict new frames.\nInterpolation requires waiting for the future frame to make a prediction, which\nadds additional latency. On the other hand, extrapolation provides a better\nquality of experience because it relies solely on past frames -- it does not\nincur any additional latency. The simplest method to extrapolate a frame is to\nwarp the previous frame using motion vectors; however, the warped frame may\ncontain improperly rendered visual artifacts due to dynamic objects -- this\nmakes it very challenging to design such a scheme. Past work has used DNNs to\nget good accuracy, however, these approaches are slow. This paper proposes\nExwarp -- an approach based on reinforcement learning (RL) to intelligently\nchoose between the slower DNN-based extrapolation and faster warping-based\nmethods to increase the frame rate by 4x with an almost negligible reduction in\nthe perceived image quality.\n","authors":["Akanksha Dixit","Yashashwee Chakrabarty","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2307.12607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07515v2","updated":"2023-07-24T08:10:52Z","published":"2023-04-15T09:39:52Z","title":"S3M: Scalable Statistical Shape Modeling through Unsupervised\n Correspondences","summary":" Statistical shape models (SSMs) are an established way to represent the\nanatomy of a population with various clinically relevant applications. However,\nthey typically require domain expertise, and labor-intensive landmark\nannotations to construct. 
We address these shortcomings by proposing an\nunsupervised method that leverages deep geometric features and functional\ncorrespondences to simultaneously learn local and global shape structures\nacross population anatomies. Our pipeline significantly improves unsupervised\ncorrespondence estimation for SSMs compared to baseline methods, even on highly\nirregular surface topologies. We demonstrate this for two different anatomical\nstructures: the thyroid and a multi-chamber heart dataset. Furthermore, our\nmethod is robust enough to learn from noisy neural network predictions,\npotentially enabling scaling SSMs to larger patient populations without manual\nsegmentation annotation.\n","authors":["Lennart Bastian","Alexander Baumann","Emily Hoppe","Vincent Bürgin","Ha Young Kim","Mahdi Saleh","Benjamin Busam","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2304.07515v2.pdf","comment":"Accepted at MICCAI 2023. 13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12591v1","updated":"2023-07-24T08:06:46Z","published":"2023-07-24T08:06:46Z","title":"SwinMM: Masked Multi-view with Swin Transformers for 3D Medical Image\n Segmentation","summary":" Recent advancements in large-scale Vision Transformers have made significant\nstrides in improving pre-trained models for medical image segmentation.\nHowever, these methods face a notable challenge in acquiring a substantial\namount of pre-training data, particularly within the medical field. To address\nthis limitation, we present Masked Multi-view with Swin Transformers (SwinMM),\na novel multi-view pipeline for enabling accurate and data-efficient\nself-supervised medical image analysis. Our strategy harnesses the potential of\nmulti-view information by incorporating two principal components. In the\npre-training phase, we deploy a masked multi-view encoder devised to\nconcurrently train masked multi-view observations through a range of diverse\nproxy tasks. These tasks span image reconstruction, rotation, contrastive\nlearning, and a novel task that employs a mutual learning paradigm. This new\ntask capitalizes on the consistency between predictions from various\nperspectives, enabling the extraction of hidden multi-view information from 3D\nmedical data. In the fine-tuning stage, a cross-view decoder is developed to\naggregate the multi-view information through a cross-attention block. Compared\nwith the previous state-of-the-art self-supervised learning method Swin UNETR,\nSwinMM demonstrates a notable advantage on several medical image segmentation\ntasks. It allows for a smooth integration of multi-view information,\nsignificantly boosting both the accuracy and data-efficiency of the model. Code\nand models are available at https://github.com/UCSC-VLAA/SwinMM/.\n","authors":["Yiqing Wang","Zihan Li","Jieru Mei","Zihao Wei","Li Liu","Chen Wang","Shengtian Sang","Alan Yuille","Cihang Xie","Yuyin Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12591v1.pdf","comment":"MICCAI 2023; project page: https://github.com/UCSC-VLAA/SwinMM/"},{"id":"http://arxiv.org/abs/2307.12580v1","updated":"2023-07-24T07:51:40Z","published":"2023-07-24T07:51:40Z","title":"SL: Stable Learning in Source-Free Domain Adaption for Medical Image\n Segmentation","summary":" Deep learning techniques for medical image analysis usually suffer from the\ndomain shift between source and target data. Most existing works focus on\nunsupervised domain adaptation (UDA). However, in practical applications,\nprivacy issues are much more severe. 
For example, the data of different\nhospitals have domain shifts due to equipment problems, and data of the two\ndomains cannot be available simultaneously because of privacy. In this\nchallenge defined as Source-Free UDA, the previous UDA medical methods are\nlimited. Although a variety of medical source-free unsupervised domain adaption\n(MSFUDA) methods have been proposed, we found they fall into an over-fitting\ndilemma called \"longer training, worse performance.\" Therefore, we propose the\nStable Learning (SL) strategy to address the dilemma. SL is a scalable method\nand can be integrated with other research, which consists of Weight\nConsolidation and Entropy Increase. First, we apply Weight Consolidation to\nretain domain-invariant knowledge and then we design Entropy Increase to avoid\nover-learning. Comparative experiments prove the effectiveness of SL. We also\nhave done extensive ablation experiments. Besides, We will release codes\nincluding a variety of MSFUDA methods.\n","authors":["Yixin Chen","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12577v1","updated":"2023-07-24T07:49:01Z","published":"2023-07-24T07:49:01Z","title":"PRIOR: Prototype Representation Joint Learning from Medical Images and\n Reports","summary":" Contrastive learning based vision-language joint pre-training has emerged as\na successful representation learning strategy. In this paper, we present a\nprototype representation learning framework incorporating both global and local\nalignment between medical images and reports. In contrast to standard global\nmulti-modality alignment methods, we employ a local alignment module for\nfine-grained representation. Furthermore, a cross-modality conditional\nreconstruction module is designed to interchange information across modalities\nin the training phase by reconstructing masked images and reports. For\nreconstructing long reports, a sentence-wise prototype memory bank is\nconstructed, enabling the network to focus on low-level localized visual and\nhigh-level clinical linguistic features. Additionally, a non-auto-regressive\ngeneration paradigm is proposed for reconstructing non-sequential reports.\nExperimental results on five downstream tasks, including supervised\nclassification, zero-shot classification, image-to-text retrieval, semantic\nsegmentation, and object detection, show the proposed method outperforms other\nstate-of-the-art methods across multiple datasets and under different dataset\nsize settings. 
The code is available at https://github.com/QtacierP/PRIOR.\n","authors":["Pujin Cheng","Li Lin","Junyan Lyu","Yijin Huang","Wenhan Luo","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2307.12577v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12574v1","updated":"2023-07-24T07:46:06Z","published":"2023-07-24T07:46:06Z","title":"A Good Student is Cooperative and Reliable: CNN-Transformer\n Collaborative Learning for Semantic Segmentation","summary":" In this paper, we strive to answer the question \"how to collaboratively learn\nconvolutional neural network (CNN)-based and vision transformer (ViT)-based\nmodels by selecting and exchanging the reliable knowledge between them for\nsemantic segmentation?\" Accordingly, we propose an online knowledge\ndistillation (KD) framework that can simultaneously learn compact yet effective\nCNN-based and ViT-based models with two key technical breakthroughs to take\nfull advantage of CNNs and ViT while compensating their limitations. Firstly,\nwe propose heterogeneous feature distillation (HFD) to improve students'\nconsistency in low-layer feature space by mimicking heterogeneous features\nbetween CNNs and ViT. Secondly, to facilitate the two students to learn\nreliable knowledge from each other, we propose bidirectional selective\ndistillation (BSD) that can dynamically transfer selective knowledge. This is\nachieved by 1) region-wise BSD determining the directions of knowledge\ntransferred between the corresponding regions in the feature space and 2)\npixel-wise BSD discerning which of the prediction knowledge to be transferred\nin the logit space. Extensive experiments on three benchmark datasets\ndemonstrate that our proposed framework outperforms the state-of-the-art online\ndistillation methods by a large margin, and shows its efficacy in learning\ncollaboratively between ViT-based and CNN-based models.\n","authors":["Jinjing Zhu","Yunhao Luo","Xu Zheng","Hao Wang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12574v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2210.10495v3","updated":"2023-07-24T07:43:31Z","published":"2022-10-19T12:04:47Z","title":"ADPS: Asymmetric Distillation Post-Segmentation for Image Anomaly\n Detection","summary":" Knowledge Distillation-based Anomaly Detection (KDAD) methods rely on the\nteacher-student paradigm to detect and segment anomalous regions by contrasting\nthe unique features extracted by both networks. However, existing KDAD methods\nsuffer from two main limitations: 1) the student network can effortlessly\nreplicate the teacher network's representations, and 2) the features of the\nteacher network serve solely as a ``reference standard\" and are not fully\nleveraged. Toward this end, we depart from the established paradigm and instead\npropose an innovative approach called Asymmetric Distillation Post-Segmentation\n(ADPS). Our ADPS employs an asymmetric distillation paradigm that takes\ndistinct forms of the same image as the input of the teacher-student networks,\ndriving the student network to learn discriminating representations for\nanomalous regions.\n Meanwhile, a customized Weight Mask Block (WMB) is proposed to generate a\ncoarse anomaly localization mask that transfers the distilled knowledge\nacquired from the asymmetric paradigm to the teacher network. 
Equipped with\nWMB, the proposed Post-Segmentation Module (PSM) is able to effectively detect\nand segment abnormal regions with fine structures and clear boundaries.\nExperimental results demonstrate that the proposed ADPS outperforms the\nstate-of-the-art methods in detecting and segmenting anomalies. Surprisingly,\nADPS significantly improves Average Precision (AP) metric by 9% and 20% on the\nMVTec AD and KolektorSDD2 datasets, respectively.\n","authors":["Peng Xing","Hao Tang","Jinhui Tang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2210.10495v3.pdf","comment":"11pages,9 figures"},{"id":"http://arxiv.org/abs/2307.12571v1","updated":"2023-07-24T07:39:22Z","published":"2023-07-24T07:39:22Z","title":"MataDoc: Margin and Text Aware Document Dewarping for Arbitrary Boundary","summary":" Document dewarping from a distorted camera-captured image is of great value\nfor OCR and document understanding. The document boundary plays an important\nrole which is more evident than the inner region in document dewarping. Current\nlearning-based methods mainly focus on complete boundary cases, leading to poor\ndocument correction performance of documents with incomplete boundaries. In\ncontrast to these methods, this paper proposes MataDoc, the first method\nfocusing on arbitrary boundary document dewarping with margin and text aware\nregularizations. Specifically, we design the margin regularization by\nexplicitly considering background consistency to enhance boundary perception.\nMoreover, we introduce word position consistency to keep text lines straight in\nrectified document images. To produce a comprehensive evaluation of MataDoc, we\npropose a novel benchmark ArbDoc, mainly consisting of document images with\narbitrary boundaries in four typical scenarios. Extensive experiments confirm\nthe superiority of MataDoc with consideration for the incomplete boundary on\nArbDoc and also demonstrate the effectiveness of the proposed method on\nDocUNet, DIR300, and WarpDoc datasets.\n","authors":["Beiya Dai","Xing li","Qunyi Xie","Yulin Li","Xiameng Qin","Chengquan Zhang","Kun Yao","Junyu Han"],"pdf_url":"https://arxiv.org/pdf/2307.12571v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2307.12560v1","updated":"2023-07-24T07:03:22Z","published":"2023-07-24T07:03:22Z","title":"Interpolating between Images with Diffusion Models","summary":" One little-explored frontier of image generation and editing is the task of\ninterpolating between two input images, a feature missing from all currently\ndeployed image generation pipelines. We argue that such a feature can expand\nthe creative applications of such models, and propose a method for zero-shot\ninterpolation using latent diffusion models. We apply interpolation in the\nlatent space at a sequence of decreasing noise levels, then perform denoising\nconditioned on interpolated text embeddings derived from textual inversion and\n(optionally) subject poses. For greater consistency, or to specify additional\ncriteria, we can generate several candidates and use CLIP to select the highest\nquality image. We obtain convincing interpolations across diverse subject\nposes, image styles, and image content, and show that standard quantitative\nmetrics such as FID are insufficient to measure the quality of an\ninterpolation. Code and data are available at\nhttps://clintonjwang.github.io/interpolation.\n","authors":["Clinton J. 
Wang","Polina Golland"],"pdf_url":"https://arxiv.org/pdf/2307.12560v1.pdf","comment":"Presented at ICML 2023 Workshop on Challenges of Deploying Generative\n AI"},{"id":"http://arxiv.org/abs/2203.01923v4","updated":"2023-07-24T06:59:56Z","published":"2022-03-03T18:56:08Z","title":"Recovering 3D Human Mesh from Monocular Images: A Survey","summary":" Estimating human pose and shape from monocular images is a long-standing\nproblem in computer vision. Since the release of statistical body models, 3D\nhuman mesh recovery has been drawing broader attention. With the same goal of\nobtaining well-aligned and physically plausible mesh results, two paradigms\nhave been developed to overcome challenges in the 2D-to-3D lifting process: i)\nan optimization-based paradigm, where different data terms and regularization\nterms are exploited as optimization objectives; and ii) a regression-based\nparadigm, where deep learning techniques are embraced to solve the problem in\nan end-to-end fashion. Meanwhile, continuous efforts are devoted to improving\nthe quality of 3D mesh labels for a wide range of datasets. Though remarkable\nprogress has been achieved in the past decade, the task is still challenging\ndue to flexible body motions, diverse appearances, complex environments, and\ninsufficient in-the-wild annotations. To the best of our knowledge, this is the\nfirst survey that focuses on the task of monocular 3D human mesh recovery. We\nstart with the introduction of body models and then elaborate recovery\nframeworks and training objectives by providing in-depth analyses of their\nstrengths and weaknesses. We also summarize datasets, evaluation metrics, and\nbenchmark results. Open issues and future directions are discussed in the end,\nhoping to motivate researchers and facilitate their research in this area. A\nregularly updated project page can be found at\nhttps://github.com/tinatiansjz/hmr-survey.\n","authors":["Yating Tian","Hongwen Zhang","Yebin Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2203.01923v4.pdf","comment":"Accepted to IEEE TPAMI, Survey on monocular 3D human mesh recovery,\n Project page: https://github.com/tinatiansjz/hmr-survey"},{"id":"http://arxiv.org/abs/2307.12558v1","updated":"2023-07-24T06:51:07Z","published":"2023-07-24T06:51:07Z","title":"Revisiting Event-based Video Frame Interpolation","summary":" Dynamic vision sensors or event cameras provide rich complementary\ninformation for video frame interpolation. Existing state-of-the-art methods\nfollow the paradigm of combining both synthesis-based and warping networks.\nHowever, few of those methods fully respect the intrinsic characteristics of\nevents streams. Given that event cameras only encode intensity changes and\npolarity rather than color intensities, estimating optical flow from events is\narguably more difficult than from RGB information. We therefore propose to\nincorporate RGB information in an event-guided optical flow refinement\nstrategy. Moreover, in light of the quasi-continuous nature of the time signals\nprovided by event cameras, we propose a divide-and-conquer strategy in which\nevent-based intermediate frame synthesis happens incrementally in multiple\nsimplified stages rather than in a single, long stage. Extensive experiments on\nboth synthetic and real-world datasets show that these modifications lead to\nmore reliable and realistic intermediate frame results than previous video\nframe interpolation methods. 
Our findings underline that a careful\nconsideration of event characteristics such as high temporal density and\nelevated noise benefits interpolation accuracy.\n","authors":["Jiaben Chen","Yichen Zhu","Dongze Lian","Jiaqi Yang","Yifu Wang","Renrui Zhang","Xinhang Liu","Shenhan Qian","Laurent Kneip","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2307.12558v1.pdf","comment":"Accepted by IROS2023 Project Site:\n https://jiabenchen.github.io/revisit_event"},{"id":"http://arxiv.org/abs/2307.12548v1","updated":"2023-07-24T06:33:52Z","published":"2023-07-24T06:33:52Z","title":"MFMAN-YOLO: A Method for Detecting Pole-like Obstacles in Complex\n Environment","summary":" In real-world traffic, there are various uncertainties and complexities in\nroad and weather conditions. To solve the problem that the feature information\nof pole-like obstacles in complex environments is easily lost, resulting in low\ndetection accuracy and low real-time performance, a multi-scale hybrid\nattention mechanism detection algorithm is proposed in this paper. First, the\noptimal transport function Monge-Kantorovich (MK) is incorporated not only to\nsolve the problem of overlapping multiple prediction frames with optimal\nmatching but also the MK function can be regularized to prevent model\nover-fitting; then, the features at different scales are up-sampled separately\naccording to the optimized efficient multi-scale feature pyramid. Finally, the\nextraction of multi-scale feature space channel information is enhanced in\ncomplex environments based on the hybrid attention mechanism, which suppresses\nthe irrelevant complex environment background information and focuses the\nfeature information of pole-like obstacles. Meanwhile, this paper conducts real\nroad test experiments in a variety of complex environments. The experimental\nresults show that the detection precision, recall, and average precision of the\nmethod are 94.7%, 93.1%, and 97.4%, respectively, and the detection frame rate\nis 400 f/s. This research method can detect pole-like obstacles in a complex\nroad environment in real time and accurately, which further promotes innovation\nand progress in the field of automatic driving.\n","authors":["Lei Cai","Hao Wang","Congling Zhou","Yongqiang Wang","Boyu Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12548v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2301.01482v5","updated":"2023-07-24T06:31:58Z","published":"2023-01-04T08:22:34Z","title":"Underwater Object Tracker: UOSTrack for Marine Organism Grasping of\n Underwater Vehicles","summary":" A visual single-object tracker is an indispensable component of underwater\nvehicles (UVs) in marine organism grasping tasks. Its accuracy and stability\nare imperative to guide the UVs to perform grasping behavior. Although\nsingle-object trackers show competitive performance in the challenge of\nunderwater image degradation, there are still issues with sample imbalance and\nexclusion of similar objects that need to be addressed for application in\nmarine organism grasping. This paper proposes Underwater OSTrack (UOSTrack),\nwhich consists of underwater image and open-air sequence hybrid training\n(UOHT), and motion-based post-processing (MBPP). The UOHT training paradigm is\ndesigned to train the sample-imbalanced underwater tracker so that the tracker\nis exposed to a great number of underwater domain training samples and learns\nthe feature expressions. The MBPP paradigm is proposed to exclude similar\nobjects. 
It uses the estimation box predicted with a Kalman filter and the\ncandidate boxes in the response map to relocate the lost tracked object in the\ncandidate area. UOSTrack achieves an average performance improvement of 4.41%\nand a maximum improvement of 7.98% over state-of-the-art methods on various\nbenchmarks. Field experiments have verified the accuracy and stability of our\nproposed UOSTrack for UVs in marine organism grasping tasks. More details can\nbe found at https://github.com/LiYunfengLYF/UOSTrack.\n","authors":["Yunfeng Li","Bo Wang","Ye Li","Zhuoyan Liu","Wei Huo","Yueming Li","Jian Cao"],"pdf_url":"https://arxiv.org/pdf/2301.01482v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12545v1","updated":"2023-07-24T06:22:37Z","published":"2023-07-24T06:22:37Z","title":"Towards Video Anomaly Retrieval from Video Anomaly Detection: New\n Benchmarks and Model","summary":" Video anomaly detection (VAD) has received increasing attention due to its\npotential applications; its current dominant tasks focus on detecting anomalies\nonline at the frame level, which can be roughly interpreted as binary\nor multiple event classification. However, such a setup that builds\nrelationships between complicated anomalous events and single labels, e.g.,\n``vandalism'', is superficial, since single labels are insufficient to\ncharacterize anomalous events. In reality, users tend to search for a specific\nvideo rather than a series of approximate videos. Therefore, retrieving\nanomalous events using detailed descriptions is practical and positive, but few\nstudies focus on this. In this context, we propose a novel task called Video\nAnomaly Retrieval (VAR), which aims to pragmatically retrieve relevant\nanomalous videos by cross-modal queries, e.g., language descriptions and\nsynchronous audio. Unlike current video retrieval, where videos are assumed\nto be temporally well-trimmed and of short duration, VAR is devised to retrieve\nlong untrimmed videos which may be partially relevant to the given query. To\nachieve this, we present two large-scale VAR benchmarks, UCFCrime-AR and\nXDViolence-AR, constructed on top of prevalent anomaly datasets. Meanwhile, we\ndesign a model called Anomaly-Led Alignment Network (ALAN) for VAR. In ALAN, we\npropose an anomaly-led sampling to focus on key segments in long untrimmed\nvideos. Then, we introduce an efficient pretext task to enhance semantic\nassociations between video-text fine-grained representations. Besides, we\nleverage two complementary alignments to further match cross-modal contents.\nExperimental results on two benchmarks reveal the challenges of the VAR task\nand also demonstrate the advantages of our tailored method.\n","authors":["Peng Wu","Jing Liu","Xiangteng He","Yuxin Peng","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12545v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2307.12542v1","updated":"2023-07-24T06:12:37Z","published":"2023-07-24T06:12:37Z","title":"Client-Level Differential Privacy via Adaptive Intermediary in Federated\n Medical Imaging","summary":" Despite recent progress in enhancing the privacy of federated learning (FL)\nvia differential privacy (DP), the trade-off of DP between privacy protection\nand performance is still underexplored for real-world medical scenarios. 
In this\npaper, we propose to optimize the trade-off under the context of client-level\nDP, which focuses on privacy during communications. However, FL for medical\nimaging involves typically much fewer participants (hospitals) than other\ndomains (e.g., mobile devices), thus ensuring clients be differentially private\nis much more challenging. To tackle this problem, we propose an adaptive\nintermediary strategy to improve performance without harming privacy.\nSpecifically, we theoretically find splitting clients into sub-clients, which\nserve as intermediaries between hospitals and the server, can mitigate the\nnoises introduced by DP without harming privacy. Our proposed approach is\nempirically evaluated on both classification and segmentation tasks using two\npublic datasets, and its effectiveness is demonstrated with significant\nperformance improvements and comprehensive analytical studies. Code is\navailable at: https://github.com/med-air/Client-DP-FL.\n","authors":["Meirui Jiang","Yuan Zhong","Anjie Le","Xiaoxiao Li","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12542v1.pdf","comment":"Accepted by 26th International Conference on Medical Image Computing\n and Computer Assisted Intervention (MICCAI'23)"},{"id":"http://arxiv.org/abs/2303.05021v3","updated":"2023-07-24T06:06:27Z","published":"2023-03-09T03:48:24Z","title":"DiffusionDepth: Diffusion Denoising Approach for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a challenging task that predicts the pixel-wise\ndepth from a single 2D image. Current methods typically model this problem as a\nregression or classification task. We propose DiffusionDepth, a new approach\nthat reformulates monocular depth estimation as a denoising diffusion process.\nIt learns an iterative denoising process to `denoise' random depth distribution\ninto a depth map with the guidance of monocular visual conditions. The process\nis performed in the latent space encoded by a dedicated depth encoder and\ndecoder. Instead of diffusing ground truth (GT) depth, the model learns to\nreverse the process of diffusing the refined depth of itself into random depth\ndistribution. This self-diffusion formulation overcomes the difficulty of\napplying generative models to sparse GT depth scenarios. The proposed approach\nbenefits this task by refining depth estimation step by step, which is superior\nfor generating accurate and highly detailed depth maps. Experimental results on\nKITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion\napproach could reach state-of-the-art performance in both indoor and outdoor\nscenarios with acceptable inference time.\n","authors":["Yiqun Duan","Xianda Guo","Zheng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05021v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12540v1","updated":"2023-07-24T06:04:12Z","published":"2023-07-24T06:04:12Z","title":"SelFormaly: Towards Task-Agnostic Unified Anomaly Detection","summary":" The core idea of visual anomaly detection is to learn the normality from\nnormal images, but previous works have been developed specifically for certain\ntasks, leading to fragmentation among various tasks: defect detection, semantic\nanomaly detection, multi-class anomaly detection, and anomaly clustering. This\none-task-one-model approach is resource-intensive and incurs high maintenance\ncosts as the number of tasks increases. This paper presents SelFormaly, a\nuniversal and powerful anomaly detection framework. 
We emphasize the necessity\nof our off-the-shelf approach by pointing out a suboptimal issue with\nfluctuating performance in previous online encoder-based methods. In addition,\nwe question the effectiveness of using ConvNets as previously employed in the\nliterature and confirm that self-supervised ViTs are suitable for unified\nanomaly detection. We introduce back-patch masking and discover the new role of\ntop k-ratio feature matching to achieve unified and powerful anomaly detection.\nBack-patch masking eliminates irrelevant regions that possibly hinder\ntarget-centric detection with representations of the scene layout. The top\nk-ratio feature matching unifies various anomaly levels and tasks. Finally,\nSelFormaly achieves state-of-the-art results across various datasets for all\nthe aforementioned tasks.\n","authors":["Yujin Lee","Harin Lim","Hyunsoo Yoon"],"pdf_url":"https://arxiv.org/pdf/2307.12540v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.12534v1","updated":"2023-07-24T05:43:34Z","published":"2023-07-24T05:43:34Z","title":"Towards Generalizable Deepfake Detection by Primary Region\n Regularization","summary":" The existing deepfake detection methods have reached a bottleneck in\ngeneralizing to unseen forgeries and manipulation approaches. Based on the\nobservation that the deepfake detectors exhibit a preference for overfitting\nthe specific primary regions in input, this paper enhances the generalization\ncapability from a novel regularization perspective. This can be simply achieved\nby augmenting the images through primary region removal, thereby preventing the\ndetector from over-relying on data bias. Our method consists of two stages,\nnamely the static localization for primary region maps, as well as the dynamic\nexploitation of primary region masks. The proposed method can be seamlessly\nintegrated into different backbones without affecting their inference\nefficiency. We conduct extensive experiments over three widely used deepfake\ndatasets - DFDC, DF-1.0, and Celeb-DF with five backbones. Our method\ndemonstrates an average performance improvement of 6% across different\nbackbones and performs competitively with several state-of-the-art baselines.\n","authors":["Harry Cheng","Yangyang Guo","Tianyi Wang","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2307.12534v1.pdf","comment":"12 pages. Code and Dataset: https://github.com/xaCheng1996/PRLE"},{"id":"http://arxiv.org/abs/2307.12532v1","updated":"2023-07-24T05:36:19Z","published":"2023-07-24T05:36:19Z","title":"On the Connection between Pre-training Data Diversity and Fine-tuning\n Robustness","summary":" Pre-training has been widely adopted in deep learning to improve model\nperformance, especially when the training data for a target task is limited. In\nour work, we seek to understand the implications of this training strategy on\nthe generalization properties of downstream models. More specifically, we ask\nthe following question: how do properties of the pre-training distribution\naffect the robustness of a fine-tuned model? The properties we explore include\nthe label space, label semantics, image diversity, data domains, and data\nquantity of the pre-training distribution. We find that the primary factor\ninfluencing downstream effective robustness (Taori et al., 2020) is data\nquantity, while other factors have limited significance. 
For example, reducing\nthe number of ImageNet pre-training classes by 4x while increasing the number\nof images per class by 4x (that is, keeping total data quantity fixed) does not\nimpact the robustness of fine-tuned models. We demonstrate our findings on\npre-training distributions drawn from various natural and synthetic data\nsources, primarily using the iWildCam-WILDS distribution shift as a test for\ndownstream robustness.\n","authors":["Vivek Ramanujan","Thao Nguyen","Sewoong Oh","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2307.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.18246v3","updated":"2023-07-24T05:35:30Z","published":"2023-03-31T17:59:09Z","title":"3D Human Pose Estimation via Intuitive Physics","summary":" Estimating 3D humans from images often produces implausible bodies that lean,\nfloat, or penetrate the floor. Such methods ignore the fact that bodies are\ntypically supported by the scene. A physics engine can be used to enforce\nphysical plausibility, but these are not differentiable, rely on unrealistic\nproxy bodies, and are difficult to integrate into existing optimization and\nlearning frameworks. In contrast, we exploit novel intuitive-physics (IP) terms\nthat can be inferred from a 3D SMPL body interacting with the scene. Inspired\nby biomechanics, we infer the pressure heatmap on the body, the Center of\nPressure (CoP) from the heatmap, and the SMPL body's Center of Mass (CoM). With\nthese, we develop IPMAN, to estimate a 3D body from a color image in a \"stable\"\nconfiguration by encouraging plausible floor contact and overlapping CoP and\nCoM. Our IP terms are intuitive, easy to implement, fast to compute,\ndifferentiable, and can be integrated into existing optimization and regression\nmethods. We evaluate IPMAN on standard datasets and MoYo, a new dataset with\nsynchronized multi-view images, ground-truth 3D bodies with complex poses,\nbody-floor contact, CoM and pressure. IPMAN produces more plausible results\nthan the state of the art, improving accuracy for static poses, while not\nhurting dynamic ones. Code and data are available for research at\nhttps://ipman.is.tue.mpg.de.\n","authors":["Shashank Tripathi","Lea Müller","Chun-Hao P. Huang","Omid Taheri","Michael J. Black","Dimitrios Tzionas"],"pdf_url":"https://arxiv.org/pdf/2303.18246v3.pdf","comment":"Accepted in CVPR'23. Project page: https://ipman.is.tue.mpg.de"},{"id":"http://arxiv.org/abs/2307.12526v1","updated":"2023-07-24T04:56:23Z","published":"2023-07-24T04:56:23Z","title":"Rethinking Medical Report Generation: Disease Revealing Enhancement with\n Knowledge Graph","summary":" Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)\nbecause it reveals the relations among diseases and thus can be utilized to\nguide the generation process. However, constructing a comprehensive KG is\nlabor-intensive and its applications on the MRG process are under-explored. In\nthis study, we establish a complete KG on chest X-ray imaging that includes 137\ntypes of diseases and abnormalities. Based on this KG, we find that the current\nMRG data sets exhibit a long-tailed problem in disease distribution. To\nmitigate this problem, we introduce a novel augmentation strategy that enhances\nthe representation of disease types in the tail-end of the distribution. We\nfurther design a two-stage MRG approach, where a classifier is first trained to\ndetect whether the input images exhibit any abnormalities. 
The classified\nimages are then independently fed into two transformer-based generators,\nnamely, ``disease-specific generator\" and ``disease-free generator\" to generate\nthe corresponding reports. To enhance the clinical evaluation of whether the\ngenerated reports correctly describe the diseases appearing in the input image,\nwe propose diverse sensitivity (DS), a new metric that checks whether generated\ndiseases match ground truth and measures the diversity of all generated\ndiseases. Results show that the proposed two-stage generation framework and\naugmentation strategies improve DS by a considerable margin, indicating a\nnotable reduction in the long-tailed problem associated with under-represented\ndiseases.\n","authors":["Yixin Wang","Zihao Lin","Haoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2307.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12517v1","updated":"2023-07-24T04:21:51Z","published":"2023-07-24T04:21:51Z","title":"Entropy Transformer Networks: A Learning Approach via Tangent Bundle\n Data Manifold","summary":" This paper focuses on an accurate and fast interpolation approach for image\ntransformation employed in the design of CNN architectures. Standard Spatial\nTransformer Networks (STNs) use bilinear or linear interpolation as their\ninterpolation, with unrealistic assumptions about the underlying data\ndistributions, which leads to poor performance under scale variations.\nMoreover, STNs do not preserve the norm of gradients in propagation due to\ntheir dependency on sparse neighboring pixels. To address this problem, a novel\nEntropy STN (ESTN) is proposed that interpolates on the data manifold\ndistributions. In particular, random samples are generated for each pixel in\nassociation with the tangent space of the data manifold and construct a linear\napproximation of their intensity values with an entropy regularizer to compute\nthe transformer parameters. A simple yet effective technique is also proposed\nto normalize the non-zero values of the convolution operation, to fine-tune the\nlayers for gradients' norm-regularization during training. Experiments on\nchallenging benchmarks show that the proposed ESTN can improve predictive\naccuracy over a range of computer vision tasks, including image reconstruction,\nand classification, while reducing the computational cost.\n","authors":["Pourya Shamsolmoali","Masoumeh Zareapoor"],"pdf_url":"https://arxiv.org/pdf/2307.12517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12539v2","updated":"2023-07-24T04:20:37Z","published":"2023-04-25T03:12:54Z","title":"Text-guided Eyeglasses Manipulation with Spatial Constraints","summary":" Virtual try-on of eyeglasses involves placing eyeglasses of different shapes\nand styles onto a face image without physically trying them on. While existing\nmethods have shown impressive results, the variety of eyeglasses styles is\nlimited and the interactions are not always intuitive or efficient. To address\nthese limitations, we propose a Text-guided Eyeglasses Manipulation method that\nallows for control of the eyeglasses shape and style based on a binary mask and\ntext, respectively. Specifically, we introduce a mask encoder to extract mask\nconditions and a modulation module that enables simultaneous injection of text\nand mask conditions. This design allows for fine-grained control of the\neyeglasses' appearance based on both textual descriptions and spatial\nconstraints. 
Our approach includes a disentangled mapper and a decoupling\nstrategy that preserves irrelevant areas, resulting in better local editing. We\nemploy a two-stage training scheme to handle the different convergence speeds\nof the various modality conditions, successfully controlling both the shape and\nstyle of eyeglasses. Extensive comparison experiments and ablation analyses\ndemonstrate the effectiveness of our approach in achieving diverse eyeglasses\nstyles while preserving irrelevant areas.\n","authors":["Jiacheng Wang","Ping Liu","Jingen Liu","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2304.12539v2.pdf","comment":"Revised version: add some experiments"},{"id":"http://arxiv.org/abs/2307.11466v2","updated":"2023-07-24T03:35:03Z","published":"2023-07-21T10:02:02Z","title":"MatSpectNet: Material Segmentation Network with Domain-Aware and\n Physically-Constrained Hyperspectral Reconstruction","summary":" Achieving accurate material segmentation for 3-channel RGB images is\nchallenging due to the considerable variation in a material's appearance.\nHyperspectral images, which are sets of spectral measurements sampled at\nmultiple wavelengths, theoretically offer distinct information for material\nidentification, as variations in intensity of electromagnetic radiation\nreflected by a surface depend on the material composition of a scene. However,\nexisting hyperspectral datasets are impoverished regarding the number of images\nand material categories for the dense material segmentation task, and\ncollecting and annotating hyperspectral images with a spectral camera is\nprohibitively expensive. To address this, we propose a new model, the\nMatSpectNet, to segment materials with recovered hyperspectral images from RGB\nimages. The network leverages the principles of colour perception in modern\ncameras to constrain the reconstructed hyperspectral images and employs the\ndomain adaptation method to generalise the hyperspectral reconstruction\ncapability from a spectral recovery dataset to material segmentation datasets.\nThe reconstructed hyperspectral images are further filtered using learned\nresponse curves and enhanced with human perception. The performance of\nMatSpectNet is evaluated on the LMD dataset as well as the OpenSurfaces\ndataset. Our experiments demonstrate that MatSpectNet attains a 1.60% increase\nin average pixel accuracy and a 3.42% improvement in mean class accuracy\ncompared with the most recent publication. The project code is attached to the\nsupplementary material and will be published on GitHub.\n","authors":["Yuwen Heng","Yihong Wu","Jiawen Chen","Srinandan Dasmahapatra","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.11466v2.pdf","comment":"7 pages main paper"},{"id":"http://arxiv.org/abs/2304.03483v2","updated":"2023-07-24T03:28:34Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first is\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. 
The second is the recent\nRegularization by Denoising (RED), which provides a flexible framework to\nexploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. Although the main focus is on dynamic tomography, we also show\nthe performance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12502v1","updated":"2023-07-24T03:27:41Z","published":"2023-07-24T03:27:41Z","title":"Cross Contrastive Feature Perturbation for Domain Generalization","summary":" Domain generalization (DG) aims to learn a robust model from source domains\nthat generalize well on unseen target domains. Recent studies focus on\ngenerating novel domain samples or features to diversify distributions\ncomplementary to source domains. Yet, these approaches can hardly deal with the\nrestriction that the samples synthesized from various domains can cause\nsemantic distortion. In this paper, we propose an online one-stage Cross\nContrasting Feature Perturbation (CCFP) framework to simulate domain shift by\ngenerating perturbed features in the latent space while regularizing the model\nprediction against domain shift. Different from the previous fixed synthesizing\nstrategy, we design modules with learnable feature perturbations and semantic\nconsistency constraints. In contrast to prior work, our method does not use any\ngenerative-based models or domain labels. We conduct extensive experiments on a\nstandard DomainBed benchmark with a strict evaluation protocol for a fair\ncomparison. Comprehensive experiments show that our method outperforms the\nprevious state-of-the-art, and quantitative analyses illustrate that our\napproach can alleviate the domain shift problem in out-of-distribution (OOD)\nscenarios.\n","authors":["Chenming Li","Daoan Zhang","Wenjian Huang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.09186v4","updated":"2023-07-24T03:20:19Z","published":"2022-04-20T02:14:20Z","title":"Reconstruction-Aware Prior Distillation for Semi-supervised Point Cloud\n Completion","summary":" Real-world sensors often produce incomplete, irregular, and noisy point\nclouds, making point cloud completion increasingly important. However, most\nexisting completion methods rely on large paired datasets for training, which\nis labor-intensive. This paper proposes RaPD, a novel semi-supervised point\ncloud completion method that reduces the need for paired datasets. RaPD\nutilizes a two-stage training scheme, where a deep semantic prior is learned in\nstage 1 from unpaired complete and incomplete point clouds, and a\nsemi-supervised prior distillation process is introduced in stage 2 to train a\ncompletion network using only a small number of paired samples. 
Additionally, a\nself-supervised completion module is introduced to improve performance using\nunpaired incomplete point clouds. Experiments on multiple datasets show that\nRaPD outperforms previous methods in both homologous and heterologous\nscenarios.\n","authors":["Zhaoxin Fan","Yulin He","Zhicheng Wang","Kejian Wu","Hongyan Liu","Jun He"],"pdf_url":"https://arxiv.org/pdf/2204.09186v4.pdf","comment":"Accepted to IJCAI 2023"},{"id":"http://arxiv.org/abs/2307.12499v1","updated":"2023-07-24T03:10:02Z","published":"2023-07-24T03:10:02Z","title":"AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion\n Models","summary":" Unrestricted adversarial attacks present a serious threat to deep learning\nmodels and adversarial defense techniques. They pose severe security problems\nfor deep learning applications because they can effectively bypass defense\nmechanisms. However, previous attack methods often utilize Generative\nAdversarial Networks (GANs), which are not theoretically provable and thus\ngenerate unrealistic examples by incorporating adversarial objectives,\nespecially for large-scale datasets like ImageNet. In this paper, we propose a\nnew method, called AdvDiff, to generate unrestricted adversarial examples with\ndiffusion models. We design two novel adversarial guidance techniques to\nconduct adversarial sampling in the reverse generation process of diffusion\nmodels. These two techniques are effective and stable in generating high-quality,\nrealistic adversarial examples by integrating gradients of the target\nclassifier interpretably. Experimental results on MNIST and ImageNet datasets\ndemonstrate that AdvDiff is effective in generating unrestricted adversarial\nexamples, which outperforms GAN-based methods in terms of attack performance\nand generation quality.\n","authors":["Xuelong Dai","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.12499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09417v2","updated":"2023-07-24T03:06:15Z","published":"2022-08-19T16:04:29Z","title":"Target-oriented Sentiment Classification with Sequential Cross-modal\n Semantic Graph","summary":" Multi-modal aspect-based sentiment classification (MABSC) is the task of\nclassifying the sentiment of a target entity mentioned in a sentence and an\nimage. However, previous methods failed to account for the fine-grained\nsemantic association between the image and the text, which resulted in limited\nidentification of fine-grained image aspects and opinions. To address these\nlimitations, in this paper we propose a new approach called SeqCSG, which\nenhances the encoder-decoder sentiment classification framework using\nsequential cross-modal semantic graphs. SeqCSG utilizes image captions and\nscene graphs to extract both global and local fine-grained image information\nand considers them as elements of the cross-modal semantic graph along with\ntokens from tweets. The sequential cross-modal semantic graph is represented as\na sequence with a multi-modal adjacency matrix indicating relationships between\nelements. Experimental results show that the approach outperforms existing\nmethods and achieves state-of-the-art performance on two standard datasets.\nFurther analysis has demonstrated that the model can implicitly learn the\ncorrelation between fine-grained information of the image and the text with the\ngiven target. Our code is available at https://github.com/zjukg/SeqCSG.\n","authors":["Yufeng Huang","Zhuo Chen","Jiaoyan Chen","Jeff Z. 
Pan","Zhen Yao","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.09417v2.pdf","comment":"ICANN 2023, https://github.com/zjukg/SeqCSG"},{"id":"http://arxiv.org/abs/2307.11411v2","updated":"2023-07-24T02:57:01Z","published":"2023-07-21T08:10:26Z","title":"Deep Directly-Trained Spiking Neural Networks for Object Detection","summary":" Spiking neural networks (SNNs) are brain-inspired energy-efficient models\nthat encode information in spatiotemporal dynamics. Recently, deep SNNs trained\ndirectly have shown great success in achieving high performance on\nclassification tasks with very few time steps. However, how to design a\ndirectly-trained SNN for the regression task of object detection still remains\na challenging problem. To address this problem, we propose EMS-YOLO, a novel\ndirectly-trained SNN framework for object detection, which is the first trial\nto train a deep SNN with surrogate gradients for object detection rather than\nANN-SNN conversion strategies. Specifically, we design a full-spike residual\nblock, EMS-ResNet, which can effectively extend the depth of the\ndirectly-trained SNN with low power consumption. Furthermore, we theoretically\nanalyze and prove the EMS-ResNet could avoid gradient vanishing or exploding.\nThe results demonstrate that our approach outperforms the state-of-the-art\nANN-SNN conversion methods (at least 500 time steps) in extremely fewer time\nsteps (only 4 time steps). It is shown that our model could achieve comparable\nperformance to the ANN with the same architecture while consuming 5.83 times\nless energy on the frame-based COCO Dataset and the event-based Gen1 Dataset.\n","authors":["Qiaoyi Su","Yuhong Chou","Yifan Hu","Jianing Li","Shijie Mei","Ziyang Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2307.11411v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.12493v1","updated":"2023-07-24T02:50:44Z","published":"2023-07-24T02:50:44Z","title":"TF-ICON: Diffusion-Based Training-Free Cross-Domain Image Composition","summary":" Text-driven diffusion models have exhibited impressive generative\ncapabilities, enabling various image editing tasks. In this paper, we propose\nTF-ICON, a novel Training-Free Image COmpositioN framework that harnesses the\npower of text-driven diffusion models for cross-domain image-guided\ncomposition. This task aims to seamlessly integrate user-provided objects into\na specific visual context. Current diffusion-based methods often involve costly\ninstance-based optimization or finetuning of pretrained models on customized\ndatasets, which can potentially undermine their rich prior. In contrast,\nTF-ICON can leverage off-the-shelf diffusion models to perform cross-domain\nimage-guided composition without requiring additional training, finetuning, or\noptimization. Moreover, we introduce the exceptional prompt, which contains no\ninformation, to facilitate text-driven diffusion models in accurately inverting\nreal images into latent representations, forming the basis for compositing. Our\nexperiments show that equipping Stable Diffusion with the exceptional prompt\noutperforms state-of-the-art inversion methods on various datasets (CelebA-HQ,\nCOCO, and ImageNet), and that TF-ICON surpasses prior baselines in versatile\nvisual domains. 
Code is available at https://github.com/Shilin-LU/TF-ICON\n","authors":["Shilin Lu","Yanzhu Liu","Adams Wai-Kin Kong"],"pdf_url":"https://arxiv.org/pdf/2307.12493v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.00932v2","updated":"2023-07-24T01:57:52Z","published":"2023-07-03T11:13:28Z","title":"A large calcium-imaging dataset reveals a systematic V4 organization for\n natural scenes","summary":" The visual system evolved to process natural scenes, yet most of our\nunderstanding of the topology and function of visual cortex derives from\nstudies using artificial stimuli. To gain deeper insights into visual\nprocessing of natural scenes, we utilized widefield calcium-imaging of primate\nV4 in response to many natural images, generating a large dataset of\ncolumnar-scale responses. We used this dataset to build a digital twin of V4\nvia deep learning, generating a detailed topographical map of natural image\npreferences at each cortical position. The map revealed clustered functional\ndomains for specific classes of natural image features. These ranged from\nsurface-related attributes like color and texture to shape-related features\nsuch as edges, curvature, and facial features. We validated the model-predicted\ndomains with additional widefield calcium-imaging and single-cell resolution\ntwo-photon imaging. Our study illuminates the detailed topological organization\nand neural codes in V4 that represent natural scenes.\n","authors":["Tianye Wang","Haoxuan Yao","Tai Sing Lee","Jiayi Hong","Yang Li","Hongfei Jiang","Ian Max Andolina","Shiming Tang"],"pdf_url":"https://arxiv.org/pdf/2307.00932v2.pdf","comment":"39 pages, 14 figures"},{"id":"http://arxiv.org/abs/2305.01788v3","updated":"2023-07-24T00:54:51Z","published":"2023-05-02T21:33:10Z","title":"Vision Meets Definitions: Unsupervised Visual Word Sense Disambiguation\n Incorporating Gloss Information","summary":" Visual Word Sense Disambiguation (VWSD) is a task to find the image that most\naccurately depicts the correct sense of the target word for the given context.\nPreviously, image-text matching models often suffered from recognizing\npolysemous words. This paper introduces an unsupervised VWSD approach that uses\ngloss information of an external lexical knowledge-base, especially the sense\ndefinitions. Specifically, we suggest employing Bayesian inference to\nincorporate the sense definitions when sense information of the answer is not\nprovided. In addition, to ameliorate the out-of-dictionary (OOD) issue, we\npropose a context-aware definition generation with GPT-3. Experimental results\nshow that the VWSD performance significantly increased with our Bayesian\ninference-based approach. In addition, our context-aware definition generation\nachieved prominent performance improvement in OOD examples exhibiting better\nperformance than the existing definition generation method.\n","authors":["Sunjae Kwon","Rishabh Garodia","Minhwa Lee","Zhichao Yang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2305.01788v3.pdf","comment":"ACL 2023, https://aclanthology.org/2023.acl-long.88"},{"id":"http://arxiv.org/abs/2307.12463v1","updated":"2023-07-24T00:53:46Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. 
Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT), which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.07916v2","updated":"2023-07-24T00:29:45Z","published":"2023-04-16T23:37:24Z","title":"GaitRef: Gait Recognition with Refined Sequential Skeletons","summary":" Identifying humans with their walking sequences, known as gait recognition,\nis a useful biometric understanding task as it can be observed from a long\ndistance and does not require cooperation from the subject. Two common\nmodalities used for representing the walking sequence of a person are\nsilhouettes and joint skeletons. Silhouette sequences, which record the\nboundary of the walking person in each frame, may suffer from the variant\nappearances from carried-on objects and clothes of the person. Framewise joint\ndetections are noisy and introduce some jitters that are not consistent with\nsequential detections. In this paper, we combine the silhouettes and skeletons\nand refine the framewise joint predictions for gait recognition with temporal\ninformation from the silhouette sequences. We show that the refined skeletons\ncan improve gait recognition performance without extra annotations. We compare\nour methods on four public datasets, CASIA-B, OUMVLP, Gait3D and GREW, and show\nstate-of-the-art performance.\n","authors":["Haidong Zhu","Wanrong Zheng","Zhaoheng Zheng","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2304.07916v2.pdf","comment":"IJCB 2023. Code is available at\n https://github.com/haidongz-usc/GaitRef"},{"id":"http://arxiv.org/abs/2307.12459v1","updated":"2023-07-24T00:03:09Z","published":"2023-07-24T00:03:09Z","title":"Robust face anti-spoofing framework with Convolutional Vision\n Transformer","summary":" Owing to the advances in image processing technology and large-scale\ndatasets, companies have implemented facial authentication processes, thereby\nstimulating increased focus on face anti-spoofing (FAS) against realistic\npresentation attacks. Recently, various attempts have been made to improve face\nrecognition performance using both global and local learning on face images;\nhowever, to the best of our knowledge, this is the first study to investigate\nwhether the robustness of FAS against domain shifts is improved by considering\nglobal information and local cues in face images captured using self-attention\nand convolutional layers. This study proposes a convolutional vision\ntransformer-based framework that achieves robust performance for various unseen\ndomain data. Our model resulted in 7.3%$p$ and 12.9%$p$ increases in FAS\nperformance compared to models using only a convolutional neural network or\nvision transformer, respectively. 
It also shows the highest average rank in\nsub-protocols of the cross-dataset setting over the other nine benchmark models for\ndomain generalization.\n","authors":["Yunseung Lee","Youngjun Kwak","Jinho Shin"],"pdf_url":"https://arxiv.org/pdf/2307.12459v1.pdf","comment":"ICIP 2023"},{"id":"http://arxiv.org/abs/2301.06363v2","updated":"2023-07-24T23:39:15Z","published":"2023-01-16T11:17:32Z","title":"A$^2$-UAV: Application-Aware Content and Network Optimization of\n Edge-Assisted UAV Systems","summary":" To perform advanced surveillance, Unmanned Aerial Vehicles (UAVs) require the\nexecution of edge-assisted computer vision (CV) tasks. In multi-hop UAV\nnetworks, the successful transmission of these tasks to the edge is severely\nchallenged due to severe bandwidth constraints. For this reason, we propose a\nnovel A$^2$-UAV framework to optimize the number of correctly executed tasks at\nthe edge. In stark contrast with existing art, we take an application-aware\napproach and formulate a novel Application-Aware Task Planning Problem\n(A$^2$-TPP) that takes into account (i) the relationship between deep neural\nnetwork (DNN) accuracy and image compression for the classes of interest based\non the available dataset, (ii) the target positions, (iii) the current\nenergy/position of the UAVs to optimize routing, data pre-processing and target\nassignment for each UAV. We demonstrate A$^2$-TPP is NP-Hard and propose a\npolynomial-time algorithm to solve it efficiently. We extensively evaluate\nA$^2$-UAV through real-world experiments with a testbed composed of four DJI\nMavic Air 2 UAVs. We consider state-of-the-art image classification tasks with\nfour different DNN models (i.e., DenseNet, ResNet152, ResNet50 and\nMobileNet-V2) and object detection tasks using YoloV4 trained on the ImageNet\ndataset. Results show that A$^2$-UAV attains on average around 38% more\naccomplished tasks than the state-of-the-art, with 400% more accomplished tasks\nwhen the number of targets increases significantly. To allow full\nreproducibility, we pledge to share datasets and code with the research\ncommunity.\n","authors":["Andrea Coletta","Flavio Giorgi","Gaia Maselli","Matteo Prata","Domenicomichele Silvestri","Jonathan Ashdown","Francesco Restuccia"],"pdf_url":"https://arxiv.org/pdf/2301.06363v2.pdf","comment":"Accepted to INFOCOM 2023"},{"id":"http://arxiv.org/abs/2307.13136v1","updated":"2023-07-24T21:29:48Z","published":"2023-07-24T21:29:48Z","title":"Does Progress On Object Recognition Benchmarks Improve Real-World\n Generalization?","summary":" For more than a decade, researchers have measured progress in object\nrecognition on ImageNet-based generalization benchmarks such as ImageNet-A, -C,\nand -R. Recent advances in foundation models, trained on orders of magnitude\nmore data, have begun to saturate these standard benchmarks, but remain brittle\nin practice. This suggests standard benchmarks, which tend to focus on\npredefined or synthetic changes, may not be sufficient for measuring real world\ngeneralization. Consequently, we propose studying generalization across\ngeography as a more realistic measure of progress using two datasets of objects\nfrom households across the globe. We conduct an extensive empirical evaluation\nof progress across nearly 100 vision models up to the most recent foundation\nmodels. 
We first identify a progress gap between standard benchmarks and\nreal-world, geographical shifts: progress on ImageNet results in up to 2.5x\nmore progress on standard generalization benchmarks than real-world\ndistribution shifts. Second, we study model generalization across geographies\nby measuring the disparities in performance across regions, a more fine-grained\nmeasure of real world generalization. We observe all models have large\ngeographic disparities, even foundation CLIP models, with differences of 7-20%\nin accuracy between regions. Counter to modern intuition, we discover progress\non standard benchmarks fails to improve geographic disparities and often\nexacerbates them: geographic disparities between the least performant models\nand today's best models have more than tripled. Our results suggest scaling\nalone is insufficient for consistent robustness to real-world distribution\nshifts. Finally, we highlight in early experiments how simple last layer\nretraining on more representative, curated data can complement scaling as a\npromising direction of future work, reducing geographic disparity on both\nbenchmarks by over two-thirds.\n","authors":["Megan Richards","Polina Kirichenko","Diane Bouchacourt","Mark Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2307.13136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13133v1","updated":"2023-07-24T21:22:58Z","published":"2023-07-24T21:22:58Z","title":"simPLE: a visuotactile method learned in simulation to precisely pick,\n localize, regrasp, and place objects","summary":" Existing robotic systems have a clear tension between generality and\nprecision. Deployed solutions for robotic manipulation tend to fall into the\nparadigm of one robot solving a single task, lacking precise generalization,\ni.e., the ability to solve many tasks without compromising on precision. This\npaper explores solutions for precise and general pick-and-place. In precise\npick-and-place, i.e. kitting, the robot transforms an unstructured arrangement\nof objects into an organized arrangement, which can facilitate further\nmanipulation. We propose simPLE (simulation to Pick Localize and PLacE) as a\nsolution to precise pick-and-place. simPLE learns to pick, regrasp and place\nobjects precisely, given only the object CAD model and no prior experience. We\ndevelop three main components: task-aware grasping, visuotactile perception,\nand regrasp planning. Task-aware grasping computes affordances of grasps that\nare stable, observable, and favorable to placing. The visuotactile perception\nmodel relies on matching real observations against a set of simulated ones\nthrough supervised learning. Finally, we compute the desired robot motion by\nsolving a shortest path problem on a graph of hand-to-hand regrasps. On a\ndual-arm robot equipped with visuotactile sensing, we demonstrate\npick-and-place of 15 diverse objects with simPLE. The objects span a wide range\nof shapes and simPLE achieves successful placements into structured\narrangements with 1mm clearance over 90% of the time for 6 objects, and over\n80% of the time for 11 objects. 
Videos are available at\nhttp://mcube.mit.edu/research/simPLE.html .\n","authors":["Maria Bauza","Antonia Bronars","Yifan Hou","Ian Taylor","Nikhil Chavan-Dafle","Alberto Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2307.13133v1.pdf","comment":"33 pages, 6 figures, 2 tables, submitted to Science Robotics"},{"id":"http://arxiv.org/abs/2205.04691v3","updated":"2023-07-24T20:56:50Z","published":"2022-05-10T06:24:09Z","title":"An Asynchronous Event-Based Algorithm for Periodic Signals","summary":" Let $0\\leq\\tau_{1}\\leq\\tau_{2}\\leq\\cdots\\leq\\tau_{m}\\leq1$, originated from a\nuniform distribution. Let also $\\epsilon,\\delta\\in\\mathbb{R}$, and\n$d\\in\\mathbb{N}$. What is the probability of having more than $d$ adjacent\n$\\tau_{i}$-s pairs that the distance between them is $\\delta$, up to an error\n$\\epsilon$ ? In this paper we are going to show how this untreated theoretical\nprobabilistic problem arises naturally from the motivation of analyzing a\nsimple asynchronous algorithm for detection of signals with a known frequency,\nusing the novel technology of an event camera.\n","authors":["David El-Chai Ben-Ezra","Ron Arad","Ayelet Padowicz","Israel Tugendhaft"],"pdf_url":"https://arxiv.org/pdf/2205.04691v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2307.13125v1","updated":"2023-07-24T20:53:59Z","published":"2023-07-24T20:53:59Z","title":"Deep Learning Approaches for Data Augmentation in Medical Imaging: A\n Review","summary":" Deep learning has become a popular tool for medical image analysis, but the\nlimited availability of training data remains a major challenge, particularly\nin the medical field where data acquisition can be costly and subject to\nprivacy regulations. Data augmentation techniques offer a solution by\nartificially increasing the number of training samples, but these techniques\noften produce limited and unconvincing results. To address this issue, a\ngrowing number of studies have proposed the use of deep generative models to\ngenerate more realistic and diverse data that conform to the true distribution\nof the data. In this review, we focus on three types of deep generative models\nfor medical image augmentation: variational autoencoders, generative\nadversarial networks, and diffusion models. We provide an overview of the\ncurrent state of the art in each of these models and discuss their potential\nfor use in different downstream tasks in medical imaging, including\nclassification, segmentation, and cross-modal translation. We also evaluate the\nstrengths and limitations of each model and suggest directions for future\nresearch in this field. Our goal is to provide a comprehensive review about the\nuse of deep generative models for medical image augmentation and to highlight\nthe potential of these models for improving the performance of deep learning\nalgorithms in medical image analysis.\n","authors":["Aghiles Kebaili","Jérôme Lapuyade-Lahorgue","Su Ruan"],"pdf_url":"https://arxiv.org/pdf/2307.13125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13110v1","updated":"2023-07-24T19:59:15Z","published":"2023-07-24T19:59:15Z","title":"Automatic Infant Respiration Estimation from Video: A Deep Flow-based\n Algorithm and a Novel Public Benchmark","summary":" Respiration is a critical vital sign for infants, and continuous respiratory\nmonitoring is particularly important for newborns. However, neonates are\nsensitive and contact-based sensors present challenges in comfort, hygiene, and\nskin health, especially for preterm babies. 
As a step toward fully automatic,\ncontinuous, and contactless respiratory monitoring, we develop a deep-learning\nmethod for estimating respiratory rate and waveform from plain video footage in\nnatural settings. Our automated infant respiration flow-based network\n(AIRFlowNet) combines video-extracted optical flow input and spatiotemporal\nconvolutional processing tuned to the infant domain. We support our model with\nthe first public annotated infant respiration dataset with 125 videos\n(AIR-125), drawn from eight infant subjects, set varied pose, lighting, and\ncamera conditions. We include manual respiration annotations and optimize\nAIRFlowNet training on them using a novel spectral bandpass loss function. When\ntrained and tested on the AIR-125 infant data, our method significantly\noutperforms other state-of-the-art methods in respiratory rate estimation,\nachieving a mean absolute error of $\\sim$2.9 breaths per minute, compared to\n$\\sim$4.7--6.2 for other public models designed for adult subjects and more\nuniform environments.\n","authors":["Sai Kumar Reddy Manne","Shaotong Zhu","Sarah Ostadabbas","Michael Wan"],"pdf_url":"https://arxiv.org/pdf/2307.13110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05799v2","updated":"2023-07-24T19:13:20Z","published":"2023-07-11T20:46:19Z","title":"3D Medical Image Segmentation based on multi-scale MPU-Net","summary":" The high cure rate of cancer is inextricably linked to physicians' accuracy\nin diagnosis and treatment, therefore a model that can accomplish\nhigh-precision tumor segmentation has become a necessity in many applications\nof the medical industry. It can effectively lower the rate of misdiagnosis\nwhile considerably lessening the burden on clinicians. However, fully automated\ntarget organ segmentation is problematic due to the irregular stereo structure\nof 3D volume organs. As a basic model for this class of real applications,\nU-Net excels. It can learn certain global and local features, but still lacks\nthe capacity to grasp spatial long-range relationships and contextual\ninformation at multiple scales. This paper proposes a tumor segmentation model\nMPU-Net for patient volume CT images, which is inspired by Transformer with a\nglobal attention mechanism. By combining image serialization with the Position\nAttention Module, the model attempts to comprehend deeper contextual\ndependencies and accomplish precise positioning. Each layer of the decoder is\nalso equipped with a multi-scale module and a cross-attention mechanism. The\ncapability of feature extraction and integration at different levels has been\nenhanced, and the hybrid loss function developed in this study can better\nexploit high-resolution characteristic information. Moreover, the suggested\narchitecture is tested and evaluated on the Liver Tumor Segmentation Challenge\n2017 (LiTS 2017) dataset. Compared with the benchmark model U-Net, MPU-Net\nshows excellent segmentation results. The dice, accuracy, precision,\nspecificity, IOU, and MCC metrics for the best model segmentation results are\n92.17%, 99.08%, 91.91%, 99.52%, 85.91%, and 91.74%, respectively. Outstanding\nindicators in various aspects illustrate the exceptional performance of this\nframework in automatic medical image segmentation.\n","authors":["Zeqiu. Yu","Shuo. Han","Ziheng. 
Song"],"pdf_url":"https://arxiv.org/pdf/2307.05799v2.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2307.13078v1","updated":"2023-07-24T18:59:46Z","published":"2023-07-24T18:59:46Z","title":"Adaptive Certified Training: Towards Better Accuracy-Robustness\n Tradeoffs","summary":" As deep learning models continue to advance and are increasingly utilized in\nreal-world systems, the issue of robustness remains a major challenge. Existing\ncertified training methods produce models that achieve high provable robustness\nguarantees at certain perturbation levels. However, the main problem of such\nmodels is a dramatically low standard accuracy, i.e. accuracy on clean\nunperturbed data, that makes them impractical. In this work, we consider a more\nrealistic perspective of maximizing the robustness of a model at certain levels\nof (high) standard accuracy. To this end, we propose a novel certified training\nmethod based on a key insight that training with adaptive certified radii helps\nto improve both the accuracy and robustness of the model, advancing\nstate-of-the-art accuracy-robustness tradeoffs. We demonstrate the\neffectiveness of the proposed method on MNIST, CIFAR-10, and TinyImageNet\ndatasets. Particularly, on CIFAR-10 and TinyImageNet, our method yields models\nwith up to two times higher robustness, measured as an average certified radius\nof a test set, at the same levels of standard accuracy compared to baseline\napproaches.\n","authors":["Zhakshylyk Nurlanov","Frank R. Schmidt","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2307.13078v1.pdf","comment":"Presented at ICML 2023 workshop \"New Frontiers in Adversarial Machine\n Learning\""},{"id":"http://arxiv.org/abs/2307.09588v2","updated":"2023-07-24T18:52:54Z","published":"2023-07-18T19:51:28Z","title":"Automating Wood Species Detection and Classification in Microscopic\n Images of Fibrous Materials with Deep Learning","summary":" We have developed a methodology for the systematic generation of a large\nimage dataset of macerated wood references, which we used to generate image\ndata for nine hardwood genera. This is the basis for a substantial approach to\nautomate, for the first time, the identification of hardwood species in\nmicroscopic images of fibrous materials by deep learning. Our methodology\nincludes a flexible pipeline for easy annotation of vessel elements. We compare\nthe performance of different neural network architectures and hyperparameters.\nOur proposed method performs similarly well to human experts. In the future,\nthis will improve controls on global wood fiber product flows to protect\nforests.\n","authors":["Lars Nieradzik","Jördis Sieburg-Rockel","Stephanie Helmling","Janis Keuper","Thomas Weibel","Andrea Olbrich","Henrike Stephani"],"pdf_url":"https://arxiv.org/pdf/2307.09588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13069v1","updated":"2023-07-24T18:50:49Z","published":"2023-07-24T18:50:49Z","title":"General-Purpose Multi-Modal OOD Detection Framework","summary":" Out-of-distribution (OOD) detection identifies test samples that differ from\nthe training data, which is critical to ensuring the safety and reliability of\nmachine learning (ML) systems. While a plethora of methods have been developed\nto detect uni-modal OOD samples, only a few have focused on multi-modal OOD\ndetection. Current contrastive learning-based methods primarily study\nmulti-modal OOD detection in a scenario where both a given image and its\ncorresponding textual description come from a new domain. 
However, real-world\ndeployments of ML systems may face more anomaly scenarios caused by multiple\nfactors like sensor faults, bad weather, and environmental changes. Hence, the\ngoal of this work is to simultaneously detect from multiple different OOD\nscenarios in a fine-grained manner. To reach this goal, we propose a\ngeneral-purpose weakly-supervised OOD detection framework, called WOOD, that\ncombines a binary classifier and a contrastive learning component to reap the\nbenefits of both. In order to better distinguish the latent representations of\nin-distribution (ID) and OOD samples, we adopt the Hinge loss to constrain\ntheir similarity. Furthermore, we develop a new scoring metric to integrate the\nprediction results from both the binary classifier and contrastive learning for\nidentifying OOD samples. We evaluate the proposed WOOD model on multiple\nreal-world datasets, and the experimental results demonstrate that the WOOD\nmodel outperforms the state-of-the-art methods for multi-modal OOD detection.\nImportantly, our approach is able to achieve high accuracy in OOD detection in\nthree different OOD scenarios simultaneously. The source code will be made\npublicly available upon publication.\n","authors":["Viet Duong","Qiong Wu","Zhengyi Zhou","Eric Zavesky","Jiahe Chen","Xiangzhou Liu","Wen-Ling Hsu","Huajie Shao"],"pdf_url":"https://arxiv.org/pdf/2307.13069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13060v1","updated":"2023-07-24T18:19:39Z","published":"2023-07-24T18:19:39Z","title":"On the characteristics of natural hydraulic dampers: An image-based\n approach to study the fluid flow behaviour inside the human meniscal tissue","summary":" The meniscal tissue is a layered material with varying properties influenced\nby collagen content and arrangement. Understanding the relationship between\nstructure and properties is crucial for disease management, treatment\ndevelopment, and biomaterial design. The internal layer of the meniscus is\nsofter and more deformable than the outer layers, thanks to interconnected\ncollagen channels that guide fluid flow. To investigate these relationships, we\npropose a novel approach that combines Computational Fluid Dynamics (CFD) with\nImage Analysis (CFD-IA). We analyze fluid flow in the internal architecture of\nthe human meniscus across a range of inlet velocities (0.1mm/s to 1.6m/s) using\nhigh-resolution 3D micro-computed tomography scans. Statistical correlations\nare observed between architectural parameters (tortuosity, connectivity,\nporosity, pore size) and fluid flow parameters (Re number distribution,\npermeability). Some channels exhibit Re values of 1400 at an inlet velocity of\n1.6m/s, and a transition from Darcy's regime to a non-Darcian regime occurs\naround an inlet velocity of 0.02m/s. Location-dependent permeability ranges\nfrom 20-32 Darcy. Regression modelling reveals a strong correlation between\nfluid velocity and tortuosity at high inlet velocities, as well as with channel\ndiameter at low inlet velocities. At higher inlet velocities, flow paths\ndeviate more from the preferential direction, resulting in a decrease in the\nconcentration parameter by an average of 0.4. This research provides valuable\ninsights into the fluid flow behaviour within the meniscus and its structural\ninfluences.\n","authors":["J. Waghorne","F. P. Bonomo","A. Rabbani","D. Bell","O. 
Barrera"],"pdf_url":"https://arxiv.org/pdf/2307.13060v1.pdf","comment":"20 Pages, 5 Figures"},{"id":"http://arxiv.org/abs/2307.02625v2","updated":"2023-07-24T18:16:38Z","published":"2023-07-05T19:56:50Z","title":"Retinex-based Image Denoising / Contrast Enhancement using Gradient\n Graph Laplacian Regularizer","summary":" Images captured in poorly lit conditions are often corrupted by acquisition\nnoise. Leveraging recent advances in graph-based regularization, we propose a\nfast Retinex-based restoration scheme that denoises and contrast-enhances an\nimage. Specifically, by Retinex theory we first assume that each image pixel is\na multiplication of its reflectance and illumination components. We next assume\nthat the reflectance and illumination components are piecewise constant (PWC)\nand continuous piecewise planar (PWP) signals, which can be recovered via graph\nLaplacian regularizer (GLR) and gradient graph Laplacian regularizer (GGLR)\nrespectively. We formulate quadratic objectives regularized by GLR and GGLR,\nwhich are minimized alternately until convergence by solving linear systems --\nwith improved condition numbers via proposed preconditioners -- via conjugate\ngradient (CG) efficiently. Experimental results show that our algorithm\nachieves competitive visual image quality while reducing computation complexity\nnoticeably.\n","authors":["Yeganeh Gharedaghi","Gene Cheung","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2307.02625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13011v1","updated":"2023-07-24T13:47:30Z","published":"2023-07-24T13:47:30Z","title":"Maximal Independent Sets for Pooling in Graph Neural Networks","summary":" Convolutional Neural Networks (CNNs) have enabled major advances in image\nclassification through convolution and pooling. In particular, image pooling\ntransforms a connected discrete lattice into a reduced lattice with the same\nconnectivity and allows reduction functions to consider all pixels in an image.\nHowever, there is no pooling that satisfies these properties for graphs. In\nfact, traditional graph pooling methods suffer from at least one of the\nfollowing drawbacks: Graph disconnection or overconnection, low decimation\nratio, and deletion of large parts of graphs. In this paper, we present three\npooling methods based on the notion of maximal independent sets that avoid\nthese pitfalls. Our experimental results confirm the relevance of maximal\nindependent set constraints for graph pooling.\n","authors":["Stevan Stanovic","Benoit Gaüzère","Luc Brun"],"pdf_url":"https://arxiv.org/pdf/2307.13011v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.09683v2","updated":"2023-07-24T15:41:03Z","published":"2023-07-18T23:35:53Z","title":"PubMed and Beyond: Recent Advances and Best Practices in Biomedical\n Literature Search","summary":" Biomedical research yields a wealth of information, much of which is only\naccessible through the literature. Consequently, literature search is an\nessential tool for building on prior knowledge in clinical and biomedical\nresearch. Although recent improvements in artificial intelligence have expanded\nfunctionality beyond keyword-based search, these advances may be unfamiliar to\nclinicians and researchers. In response, we present a survey of literature\nsearch tools tailored to both general and specific information needs in\nbiomedicine, with the objective of helping readers efficiently fulfill their\ninformation needs. 
We first examine the widely used PubMed search engine,\ndiscussing recent improvements and continued challenges. We then describe\nliterature search tools catering to five specific information needs: 1.\nIdentifying high-quality clinical research for evidence-based medicine. 2.\nRetrieving gene-related information for precision medicine and genomics. 3.\nSearching by meaning, including natural language questions. 4. Locating related\narticles with literature recommendation. 5. Mining literature to discover\nassociations between concepts such as diseases and genetic variants.\nAdditionally, we cover practical considerations and best practices for choosing\nand using these tools. Finally, we provide a perspective on the future of\nliterature search engines, considering recent breakthroughs in large language\nmodels such as ChatGPT. In summary, our survey provides a comprehensive view of\nbiomedical literature search functionalities with 36 publicly available tools.\n","authors":["Qiao Jin","Robert Leaman","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.09683v2.pdf","comment":"27 pages, 6 figures, 36 tools"},{"id":"http://arxiv.org/abs/2307.12810v1","updated":"2023-07-24T14:00:07Z","published":"2023-07-24T14:00:07Z","title":"HeteFedRec: Federated Recommender Systems with Model Heterogeneity","summary":" Owing to the nature of privacy protection, federated recommender systems\n(FedRecs) have garnered increasing interest in the realm of on-device\nrecommender systems. However, most existing FedRecs only allow participating\nclients to collaboratively train a recommendation model of the same public\nparameter size. Training a model of the same size for all clients can lead to\nsuboptimal performance since clients possess varying resources. For example,\nclients with limited training data may prefer to train a smaller recommendation\nmodel to avoid excessive data consumption, while clients with sufficient data\nwould benefit from a larger model to achieve higher recommendation accuracy. To\naddress the above challenge, this paper introduces HeteFedRec, a novel FedRec\nframework that enables the assignment of personalized model sizes to\nparticipants. In HeteFedRec, we present a heterogeneous recommendation model\naggregation strategy, including a unified dual-task learning mechanism and a\ndimensional decorrelation regularization, to allow knowledge aggregation among\nrecommender models of different sizes. Additionally, a relation-based ensemble\nknowledge distillation method is proposed to effectively distil knowledge from\nheterogeneous item embeddings. Extensive experiments conducted on three\nreal-world recommendation datasets demonstrate the effectiveness and efficiency\nof HeteFedRec in training federated recommender systems under heterogeneous\nsettings.\n","authors":["Wei Yuan","Liang Qu","Lizhen Cui","Yongxin Tong","Xiaofang Zhou","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2307.12810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12798v1","updated":"2023-07-24T13:51:19Z","published":"2023-07-24T13:51:19Z","title":"RRAML: Reinforced Retrieval Augmented Machine Learning","summary":" The emergence of large language models (LLMs) has revolutionized machine\nlearning and related fields, showcasing remarkable abilities in comprehending,\ngenerating, and manipulating human language. However, their conventional usage\nthrough API-based text prompt submissions imposes certain limitations in terms\nof context constraints and external source availability. 
To address these\nchallenges, we propose a novel framework called Reinforced Retrieval Augmented\nMachine Learning (RRAML). RRAML integrates the reasoning capabilities of LLMs\nwith supporting information retrieved by a purpose-built retriever from a vast\nuser-provided database. By leveraging recent advancements in reinforcement\nlearning, our method effectively addresses several critical challenges.\nFirstly, it circumvents the need for accessing LLM gradients. Secondly, our\nmethod alleviates the burden of retraining LLMs for specific tasks, as it is\noften impractical or impossible due to restricted access to the model and the\ncomputational intensity involved. Additionally we seamlessly link the\nretriever's task with the reasoner, mitigating hallucinations and reducing\nirrelevant, and potentially damaging retrieved documents. We believe that the\nresearch agenda outlined in this paper has the potential to profoundly impact\nthe field of AI, democratizing access to and utilization of LLMs for a wide\nrange of entities.\n","authors":["Andrea Bacciu","Florin Cocunasu","Federico Siciliano","Fabrizio Silvestri","Nicola Tonellotto","Giovanni Trappolini"],"pdf_url":"https://arxiv.org/pdf/2307.12798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12756v1","updated":"2023-07-24T12:58:47Z","published":"2023-07-24T12:58:47Z","title":"Unbiased Delayed Feedback Label Correction for Conversion Rate\n Prediction","summary":" Conversion rate prediction is critical to many online applications such as\ndigital display advertising. To capture dynamic data distribution, industrial\nsystems often require retraining models on recent data daily or weekly.\nHowever, the delay of conversion behavior usually leads to incorrect labeling,\nwhich is called delayed feedback problem. Existing work may fail to introduce\nthe correct information about false negative samples due to data sparsity and\ndynamic data distribution. To directly introduce the correct feedback label\ninformation, we propose an Unbiased delayed feedback Label Correction framework\n(ULC), which uses an auxiliary model to correct labels for observed negative\nfeedback samples. Firstly, we theoretically prove that the label-corrected loss\nis an unbiased estimate of the oracle loss using true labels. Then, as there\nare no ready training data for label correction, counterfactual labeling is\nused to construct artificial training data. Furthermore, since counterfactual\nlabeling utilizes only partial training data, we design an embedding-based\nalternative training method to enhance performance. Comparative experiments on\nboth public and private datasets and detailed analyses show that our proposed\napproach effectively alleviates the delayed feedback problem and consistently\noutperforms the previous state-of-the-art methods.\n","authors":["Yifan Wang","Peijie Sun","Min Zhang","Qinglin Jia","Jingjie Li","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2307.12756v1.pdf","comment":"accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2307.12576v1","updated":"2023-07-24T07:47:21Z","published":"2023-07-24T07:47:21Z","title":"Self-refining of Pseudo Labels for Music Source Separation with Noisy\n Labeled Data","summary":" Music source separation (MSS) faces challenges due to the limited\navailability of correctly-labeled individual instrument tracks. With the push\nto acquire larger datasets to improve MSS performance, the inevitability of\nencountering mislabeled individual instrument tracks becomes a significant\nchallenge to address. 
This paper introduces an automated technique for refining\nthe labels in a partially mislabeled dataset. Our proposed self-refining\ntechnique, employed with a noisy-labeled dataset, results in only a 1% accuracy\ndegradation in multi-label instrument recognition compared to a classifier\ntrained on a clean-labeled dataset. The study demonstrates the importance of\nrefining noisy-labeled data in MSS model training and shows that utilizing the\nrefined dataset leads to comparable results derived from a clean-labeled\ndataset. Notably, upon only access to a noisy dataset, MSS models trained on a\nself-refined dataset even outperform those trained on a dataset refined with a\nclassifier trained on clean labels.\n","authors":["Junghyun Koo","Yunkee Chae","Chang-Bin Jeon","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12576v1.pdf","comment":"24th International Society for Music Information Retrieval Conference\n (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2307.10617v3","updated":"2023-07-24T07:03:01Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. 
The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12518v1","updated":"2023-07-24T04:23:08Z","published":"2023-07-24T04:23:08Z","title":"FaFCNN: A General Disease Classification Framework Based on Feature\n Fusion Neural Networks","summary":" There are two fundamental problems in applying deep learning/machine learning\nmethods to disease classification tasks, one is the insufficient number and\npoor quality of training samples; another one is how to effectively fuse\nmultiple source features and thus train robust classification models. To\naddress these problems, inspired by the process of human learning knowledge, we\npropose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which\nintroduces a feature-aware interaction module and a feature alignment module\nbased on domain adversarial learning. This is a general framework for disease\nclassification, and FaFCNN improves the way existing methods obtain sample\ncorrelation features. The experimental results show that training using\naugmented features obtained by pre-training gradient boosting decision tree\nyields more performance gains than random-forest based methods. On the\nlow-quality dataset with a large amount of missing data in our setup, FaFCNN\nobtains a consistently optimal performance compared to competitive baselines.\nIn addition, extensive experiments demonstrate the robustness of the proposed\nmethod and the effectiveness of each component of the model\\footnote{Accepted\nin IEEE SMC2023}.\n","authors":["Menglin Kong","Shaojie Zhao","Juan Cheng","Xingquan Li","Ri Su","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13165v1","updated":"2023-07-24T23:26:46Z","published":"2023-07-24T23:26:46Z","title":"Investigating the Robustness of Sequential Recommender Systems Against\n Training Data Perturbations: an Empirical Study","summary":" Sequential Recommender Systems (SRSs) have been widely used to model user\nbehavior over time, but their robustness in the face of perturbations to\ntraining data is a critical issue. In this paper, we conduct an empirical study\nto investigate the effects of removing items at different positions within a\ntemporally ordered sequence. We evaluate two different SRS models on multiple\ndatasets, measuring their performance using Normalized Discounted Cumulative\nGain (NDCG) and Rank Sensitivity List metrics. Our results demonstrate that\nremoving items at the end of the sequence significantly impacts performance,\nwith NDCG decreasing up to 60\\%, while removing items from the beginning or\nmiddle has no significant effect. 
These findings highlight the importance of\nconsidering the position of the perturbed items in the training data and shall\ninform the design of more robust SRSs.\n","authors":["Filippo Betello","Federico Siciliano","Pushkar Mishra","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2307.13165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.15498v2","updated":"2023-07-24T20:08:20Z","published":"2021-06-29T15:25:33Z","title":"Classification of Consumer Belief Statements From Social Media","summary":" Social media offer plenty of information to perform market research in order\nto meet the requirements of customers. One way how this research is conducted\nis that a domain expert gathers and categorizes user-generated content into a\ncomplex and fine-grained class structure. In many of such cases, little data\nmeets complex annotations. It is not yet fully understood how this can be\nleveraged successfully for classification. We examine the classification\naccuracy of expert labels when used with a) many fine-grained classes and b)\nfew abstract classes. For scenario b) we compare abstract class labels given by\nthe domain expert as baseline and by automatic hierarchical clustering. We\ncompare this to another baseline where the entire class structure is given by a\ncompletely unsupervised clustering approach. By doing so, this work can serve\nas an example of how complex expert annotations are potentially beneficial and\ncan be utilized in the most optimal way for opinion mining in highly specific\ndomains. By exploring across a range of techniques and experiments, we find\nthat automated class abstraction approaches in particular the unsupervised\napproach performs remarkably well against domain expert baseline on text\nclassification tasks. This has the potential to inspire opinion mining\napplications in order to support market researchers in practice and to inspire\nfine-grained automated content analysis on a large scale.\n","authors":["Gerhard Johann Hagerer","Wenbin Le","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2106.15498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.02259v3","updated":"2023-07-24T20:03:14Z","published":"2021-11-03T14:49:50Z","title":"A Case Study and Qualitative Analysis of Simple Cross-Lingual Opinion\n Mining","summary":" User-generated content from social media is produced in many languages,\nmaking it technically challenging to compare the discussed themes from one\ndomain across different cultures and regions. It is relevant for domains in a\nglobalized world, such as market research, where people from two nations and\nmarkets might have different requirements for a product. We propose a simple,\nmodern, and effective method for building a single topic model with sentiment\nanalysis capable of covering multiple languages simultanteously, based on a\npre-trained state-of-the-art deep neural network for natural language\nunderstanding. To demonstrate its feasibility, we apply the model to newspaper\narticles and user comments of a specific domain, i.e., organic food products\nand related consumption behavior. 
The themes match across languages.\nAdditionally, we obtain an high proportion of stable and domain-relevant\ntopics, a meaningful relation between topics and their respective textual\ncontents, and an interpretable representation for social media documents.\nMarketing can potentially benefit from our method, since it provides an\neasy-to-use means of addressing specific customer interests from different\nmarket regions around the globe. For reproducibility, we provide the code,\ndata, and results of our study.\n","authors":["Gerhard Johann Hagerer","Wing Sheung Leung","Qiaoxi Liu","Hannah Danner","Georg Groh"],"pdf_url":"https://arxiv.org/pdf/2111.02259v3.pdf","comment":"10 pages, 2 tables, 5 figures, full paper, peer-reviewed, published\n at KDIR/IC3k 2021 conference"},{"id":"http://arxiv.org/abs/2304.04759v2","updated":"2023-07-24T18:10:09Z","published":"2023-04-07T23:10:39Z","title":"Similarity search in the blink of an eye with compressed indices","summary":" Nowadays, data is represented by vectors. Retrieving those vectors, among\nmillions and billions, that are similar to a given query is a ubiquitous\nproblem, known as similarity search, of relevance for a wide range of\napplications. Graph-based indices are currently the best performing techniques\nfor billion-scale similarity search. However, their random-access memory\npattern presents challenges to realize their full potential. In this work, we\npresent new techniques and systems for creating faster and smaller graph-based\nindices. To this end, we introduce a novel vector compression method,\nLocally-adaptive Vector Quantization (LVQ), that uses per-vector scaling and\nscalar quantization to improve search performance with fast similarity\ncomputations and a reduced effective bandwidth, while decreasing memory\nfootprint and barely impacting accuracy. LVQ, when combined with a new\nhigh-performance computing system for graph-based similarity search,\nestablishes the new state of the art in terms of performance and memory\nfootprint. For billions of vectors, LVQ outcompetes the second-best\nalternatives: (1) in the low-memory regime, by up to 20.7x in throughput with\nup to a 3x memory footprint reduction, and (2) in the high-throughput regime by\n5.8x with 1.4x less memory.\n","authors":["Cecilia Aguerrebere","Ishwar Bhati","Mark Hildebrand","Mariano Tepper","Ted Willke"],"pdf_url":"https://arxiv.org/pdf/2304.04759v2.pdf","comment":"VLDB 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.12983v1","updated":"2023-07-24T17:59:37Z","published":"2023-07-24T17:59:37Z","title":"Parallel $Q$-Learning: Scaling Off-policy Reinforcement Learning under\n Massively Parallel Simulation","summary":" Reinforcement learning is time-consuming for complex tasks due to the need\nfor large amounts of training data. Recent advances in GPU-based simulation,\nsuch as Isaac Gym, have sped up data collection thousands of times on a\ncommodity GPU. Most prior works used on-policy methods like PPO due to their\nsimplicity and ease of scaling. Off-policy methods are more data efficient but\nchallenging to scale, resulting in a longer wall-clock training time. This\npaper presents a Parallel $Q$-Learning (PQL) scheme that outperforms PPO in\nwall-clock time while maintaining superior sample efficiency of off-policy\nlearning. PQL achieves this by parallelizing data collection, policy learning,\nand value learning. 
Different from prior works on distributed off-policy\nlearning, such as Apex, our scheme is designed specifically for massively\nparallel GPU-based simulation and optimized to work on a single workstation. In\nexperiments, we demonstrate that $Q$-learning can be scaled to \\textit{tens of\nthousands of parallel environments} and investigate important factors affecting\nlearning speed. The code is available at https://github.com/Improbable-AI/pql.\n","authors":["Zechu Li","Tao Chen","Zhang-Wei Hong","Anurag Ajay","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2307.12983v1.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2307.12981v1","updated":"2023-07-24T17:59:02Z","published":"2023-07-24T17:59:02Z","title":"3D-LLM: Injecting the 3D World into Large Language Models","summary":" Large language models (LLMs) and Vision-Language Models (VLMs) have been\nproven to excel at multiple tasks, such as commonsense reasoning. Powerful as\nthese models can be, they are not grounded in the 3D physical world, which\ninvolves richer concepts such as spatial relationships, affordances, physics,\nlayout, and so on. In this work, we propose to inject the 3D world into large\nlanguage models and introduce a whole new family of 3D-LLMs. Specifically,\n3D-LLMs can take 3D point clouds and their features as input and perform a\ndiverse set of 3D-related tasks, including captioning, dense captioning, 3D\nquestion answering, task decomposition, 3D grounding, 3D-assisted dialog,\nnavigation, and so on. Using three types of prompting mechanisms that we\ndesign, we are able to collect over 300k 3D-language data covering these tasks.\nTo efficiently train 3D-LLMs, we first utilize a 3D feature extractor that\nobtains 3D features from rendered multi- view images. Then, we use 2D VLMs as\nour backbones to train our 3D-LLMs. By introducing a 3D localization mechanism,\n3D-LLMs can better capture 3D spatial information. Experiments on ScanQA show\nthat our model outperforms state-of-the-art baselines by a large margin (e.g.,\nthe BLEU-1 score surpasses state-of-the-art score by 9%). Furthermore,\nexperiments on our held-in datasets for 3D captioning, task composition, and\n3D-assisted dialogue show that our model outperforms 2D VLMs. Qualitative\nexamples also show that our model could perform more tasks beyond the scope of\nexisting LLMs and VLMs. Project Page: : https://vis-www.cs.umass.edu/3dllm/.\n","authors":["Yining Hong","Haoyu Zhen","Peihao Chen","Shuhong Zheng","Yilun Du","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2307.12981v1.pdf","comment":"Project Page: : https://vis-www.cs.umass.edu/3dllm/"},{"id":"http://arxiv.org/abs/2303.06147v2","updated":"2023-07-24T17:58:45Z","published":"2023-03-10T18:59:57Z","title":"Exphormer: Sparse Transformers for Graphs","summary":" Graph transformers have emerged as a promising architecture for a variety of\ngraph learning and representation tasks. Despite their successes, though, it\nremains challenging to scale graph transformers to large graphs while\nmaintaining accuracy competitive with message-passing networks. In this paper,\nwe introduce Exphormer, a framework for building powerful and scalable graph\ntransformers. 
Exphormer consists of a sparse attention mechanism based on two\nmechanisms: virtual global nodes and expander graphs, whose mathematical\ncharacteristics, such as spectral expansion, pseduorandomness, and sparsity,\nyield graph transformers with complexity only linear in the size of the graph,\nwhile allowing us to prove desirable theoretical properties of the resulting\ntransformer models. We show that incorporating Exphormer into the\nrecently-proposed GraphGPS framework produces models with competitive empirical\nresults on a wide variety of graph datasets, including state-of-the-art results\non three datasets. We also show that Exphormer can scale to datasets on larger\ngraphs than shown in previous graph transformer architectures. Code can be\nfound at \\url{https://github.com/hamed1375/Exphormer}.\n","authors":["Hamed Shirzad","Ameya Velingker","Balaji Venkatachalam","Danica J. Sutherland","Ali Kemal Sinop"],"pdf_url":"https://arxiv.org/pdf/2303.06147v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05407v3","updated":"2023-07-24T17:58:31Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen unknown categories into instances, without any prior knowledge about\nthem, while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12979v1","updated":"2023-07-24T17:56:58Z","published":"2023-07-24T17:56:58Z","title":"An Isometric Stochastic Optimizer","summary":" The Adam optimizer is the standard choice in deep learning applications. I\npropose a simple explanation of Adam's success: it makes each parameter's step\nsize independent of the norms of the other parameters. 
Based on this principle\nI derive Iso, a new optimizer which makes the norm of a parameter's update\ninvariant to the application of any linear transformation to its inputs and\noutputs. I develop a variant of Iso called IsoAdam that allows optimal\nhyperparameters to be transferred from Adam, and demonstrate that IsoAdam\nobtains a speedup over Adam when training a small Transformer.\n","authors":["Jacob Jackson"],"pdf_url":"https://arxiv.org/pdf/2307.12979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12975v1","updated":"2023-07-24T17:50:24Z","published":"2023-07-24T17:50:24Z","title":"Provable Benefits of Policy Learning from Human Preferences in\n Contextual Bandit Problems","summary":" A crucial task in decision-making problems is reward engineering. It is\ncommon in practice that no obvious choice of reward function exists. Thus, a\npopular approach is to introduce human feedback during training and leverage\nsuch feedback to learn a reward function. Among all policy learning methods\nthat use human feedback, preference-based methods have demonstrated substantial\nsuccess in recent empirical applications such as InstructGPT. In this work, we\ndevelop a theory that provably shows the benefits of preference-based methods\nin offline contextual bandits. In particular, we improve the modeling and\nsuboptimality analysis for running policy learning methods on human-scored\nsamples directly. Then, we compare it with the suboptimality guarantees of\npreference-based methods and show that preference-based methods enjoy lower\nsuboptimality.\n","authors":["Xiang Ji","Huazheng Wang","Minshuo Chen","Tuo Zhao","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2307.12975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12971v1","updated":"2023-07-24T17:49:05Z","published":"2023-07-24T17:49:05Z","title":"Big Data - Supply Chain Management Framework for Forecasting: Data\n Preprocessing and Machine Learning Techniques","summary":" This article intends to systematically identify and comparatively analyze\nstate-of-the-art supply chain (SC) forecasting strategies and technologies. A\nnovel framework has been proposed incorporating Big Data Analytics in SC\nManagement (problem identification, data sources, exploratory data analysis,\nmachine-learning model training, hyperparameter tuning, performance evaluation,\nand optimization), forecasting effects on human-workforce, inventory, and\noverall SC. Initially, the need to collect data according to SC strategy and\nhow to collect them has been discussed. The article discusses the need for\ndifferent types of forecasting according to the period or SC objective. The SC\nKPIs and the error-measurement systems have been recommended to optimize the\ntop-performing model. The adverse effects of phantom inventory on forecasting\nand the dependence of managerial decisions on the SC KPIs for determining model\nperformance parameters and improving operations management, transparency, and\nplanning efficiency have been illustrated. The cyclic connection within the\nframework introduces preprocessing optimization based on the post-process KPIs,\noptimizing the overall control process (inventory management, workforce\ndetermination, cost, production and capacity planning). 
The contribution of\nthis research lies in the standard SC process framework proposal, recommended\nforecasting data analysis, forecasting effects on SC performance, machine\nlearning algorithms optimization followed, and in shedding light on future\nresearch.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","Jungpil Shin","Istiyaque Ahmed Ridoy","Yoichi Tomioka","M. F. Mridha"],"pdf_url":"https://arxiv.org/pdf/2307.12971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12968v1","updated":"2023-07-24T17:46:32Z","published":"2023-07-24T17:46:32Z","title":"A Connection between One-Step Regularization and Critic Regularization\n in Reinforcement Learning","summary":" As with any machine learning problem with limited data, effective offline RL\nalgorithms require careful regularization to avoid overfitting. One-step\nmethods perform regularization by doing just a single step of policy\nimprovement, while critic regularization methods do many steps of policy\nimprovement with a regularized objective. These methods appear distinct.\nOne-step methods, such as advantage-weighted regression and conditional\nbehavioral cloning, truncate policy iteration after just one step. This ``early\nstopping'' makes one-step RL simple and stable, but can limit its asymptotic\nperformance. Critic regularization typically requires more compute but has\nappealing lower-bound guarantees. In this paper, we draw a close connection\nbetween these methods: applying a multi-step critic regularization method with\na regularization coefficient of 1 yields the same policy as one-step RL. While\npractical implementations violate our assumptions and critic regularization is\ntypically applied with smaller regularization coefficients, our experiments\nnevertheless show that our analysis makes accurate, testable predictions about\npractical offline RL methods (CQL and one-step RL) with commonly-used\nhyperparameters. Our results that every problem can be solved with a single\nstep of policy improvement, but rather that one-step RL might be competitive\nwith critic regularization on RL problems that demand strong regularization.\n","authors":["Benjamin Eysenbach","Matthieu Geist","Sergey Levine","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2307.12968v1.pdf","comment":"Accepted to ICML 2023. Video\n (https://www.youtube.com/watch?v=1xlixIHZ0R4) and code\n (https://github.com/ben-eysenbach/ac-connection)"},{"id":"http://arxiv.org/abs/2307.12967v1","updated":"2023-07-24T17:45:40Z","published":"2023-07-24T17:45:40Z","title":"Learning Dense Correspondences between Photos and Sketches","summary":" Humans effortlessly grasp the connection between sketches and real-world\nobjects, even when these sketches are far from realistic. Moreover, human\nsketch understanding goes beyond categorization -- critically, it also entails\nunderstanding how individual elements within a sketch correspond to parts of\nthe physical world it represents. What are the computational ingredients needed\nto support this ability? Towards answering this question, we make two\ncontributions: first, we introduce a new sketch-photo correspondence benchmark,\n$\\textit{PSC6k}$, containing 150K annotations of 6250 sketch-photo pairs across\n125 object categories, augmenting the existing Sketchy dataset with\nfine-grained correspondence metadata. Second, we propose a self-supervised\nmethod for learning dense correspondences between sketch-photo pairs, building\nupon recent advances in correspondence learning for pairs of photos. 
Our model\nuses a spatial transformer network to estimate the warp flow between latent\nrepresentations of a sketch and photo extracted by a contrastive learning-based\nConvNet backbone. We found that this approach outperformed several strong\nbaselines and produced predictions that were quantitatively consistent with\nother warp-based methods. However, our benchmark also revealed systematic\ndifferences between predictions of the suite of models we tested and those of\nhumans. Taken together, our work suggests a promising path towards developing\nartificial systems that achieve more human-like understanding of visual images\nat different levels of abstraction. Project page:\nhttps://photo-sketch-correspondence.github.io\n","authors":["Xuanchen Lu","Xiaolong Wang","Judith E Fan"],"pdf_url":"https://arxiv.org/pdf/2307.12967v1.pdf","comment":"Accepted to ICML 2023. Project page:\n https://photo-sketch-correspondence.github.io"},{"id":"http://arxiv.org/abs/2303.04245v2","updated":"2023-07-24T17:29:04Z","published":"2023-03-07T21:42:17Z","title":"How Do Transformers Learn Topic Structure: Towards a Mechanistic\n Understanding","summary":" While the successes of transformers across many domains are indisputable,\naccurate understanding of the learning mechanics is still largely lacking.\nTheir capabilities have been probed on benchmarks which include a variety of\nstructured and reasoning tasks -- but mathematical understanding is lagging\nsubstantially behind. Recent lines of work have begun studying representational\naspects of this question: that is, the size/depth/complexity of attention-based\nnetworks to perform certain tasks. However, there is no guarantee the learning\ndynamics will converge to the constructions proposed. In our paper, we provide\nfine-grained mechanistic understanding of how transformers learn \"semantic\nstructure\", understood as capturing co-occurrence structure of words.\nPrecisely, we show, through a combination of mathematical analysis and\nexperiments on Wikipedia data and synthetic data modeled by Latent Dirichlet\nAllocation (LDA), that the embedding layer and the self-attention layer encode\nthe topical structure. In the former case, this manifests as higher average\ninner product of embeddings between same-topic words. In the latter, it\nmanifests as higher average pairwise attention between same-topic words. The\nmathematical results involve several assumptions to make the analysis\ntractable, which we verify on data, and might be of independent interest as\nwell.\n","authors":["Yuchen Li","Yuanzhi Li","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2303.04245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12943v1","updated":"2023-07-24T17:15:38Z","published":"2023-07-24T17:15:38Z","title":"Efficiently Sampling the PSD Cone with the Metric Dikin Walk","summary":" Semi-definite programs represent a frontier of efficient computation. While\nthere has been much progress on semi-definite optimization, with moderate-sized\ninstances currently solvable in practice by the interior-point method, the\nbasic problem of sampling semi-definite solutions remains a formidable\nchallenge. The direct application of known polynomial-time algorithms for\nsampling general convex bodies to semi-definite sampling leads to a\nprohibitively high running time. In addition, known general methods require an\nexpensive rounding phase as pre-processing. 
Here we analyze the Dikin walk, by\nfirst adapting it to general metrics, then devising suitable metrics for the\nPSD cone with affine constraints. The resulting mixing time and per-step\ncomplexity are considerably smaller, and by an appropriate choice of the\nmetric, the dependence on the number of constraints can be made\npolylogarithmic. We introduce a refined notion of self-concordant matrix\nfunctions and give rules for combining different metrics. Along the way, we\nfurther develop the theory of interior-point methods for sampling.\n","authors":["Yunbum Kook","Santosh S. Vempala"],"pdf_url":"https://arxiv.org/pdf/2307.12943v1.pdf","comment":"54 pages"},{"id":"http://arxiv.org/abs/2307.12941v1","updated":"2023-07-24T17:11:39Z","published":"2023-07-24T17:11:39Z","title":"On Privileged and Convergent Bases in Neural Network Representations","summary":" In this study, we investigate whether the representations learned by neural\nnetworks possess a privileged and convergent basis. Specifically, we examine\nthe significance of feature directions represented by individual neurons.\nFirst, we establish that arbitrary rotations of neural representations cannot\nbe inverted (unlike linear networks), indicating that they do not exhibit\ncomplete rotational invariance. Subsequently, we explore the possibility of\nmultiple bases achieving identical performance. To do this, we compare the\nbases of networks trained with the same parameters but with varying random\ninitializations. Our study reveals two findings: (1) Even in wide networks such\nas WideResNets, neural networks do not converge to a unique basis; (2) Basis\ncorrelation increases significantly when a few early layers of the network are\nfrozen identically.\n Furthermore, we analyze Linear Mode Connectivity, which has been studied as a\nmeasure of basis correlation. Our findings give evidence that while Linear Mode\nConnectivity improves with increased network width, this improvement is not due\nto an increase in basis correlation.\n","authors":["Davis Brown","Nikhil Vyas","Yamini Bansal"],"pdf_url":"https://arxiv.org/pdf/2307.12941v1.pdf","comment":"In the Workshop on High-dimensional Learning Dynamics at ICML 2023"},{"id":"http://arxiv.org/abs/2307.08572v3","updated":"2023-07-24T17:01:50Z","published":"2023-07-17T15:38:11Z","title":"Revisiting the Robustness of the Minimum Error Entropy Criterion: A\n Transfer Learning Case Study","summary":" Coping with distributional shifts is an important part of transfer learning\nmethods in order to perform well in real-life tasks. However, most of the\nexisting approaches in this area either focus on an ideal scenario in which the\ndata does not contain noises or employ a complicated training paradigm or model\ndesign to deal with distributional shifts. In this paper, we revisit the\nrobustness of the minimum error entropy (MEE) criterion, a widely used\nobjective in statistical signal processing to deal with non-Gaussian noises,\nand investigate its feasibility and usefulness in real-life transfer learning\nregression tasks, where distributional shifts are common. Specifically, we put\nforward a new theoretical result showing the robustness of MEE against\ncovariate shift. We also show that by simply replacing the mean squared error\n(MSE) loss with the MEE on basic transfer learning algorithms such as\nfine-tuning and linear probing, we can achieve competitive performance with\nrespect to state-of-the-art transfer learning algorithms. 
We justify our\narguments on both synthetic data and 5 real-world time-series data.\n","authors":["Luis Pedro Silvestrin","Shujian Yu","Mark Hoogendoorn"],"pdf_url":"https://arxiv.org/pdf/2307.08572v3.pdf","comment":"Manuscript accepted at ECAI-23. Code available at\n https://github.com/lpsilvestrin/mee-finetune"},{"id":"http://arxiv.org/abs/2307.12926v1","updated":"2023-07-24T16:36:04Z","published":"2023-07-24T16:36:04Z","title":"Contextual Bandits and Imitation Learning via Preference-Based Active\n Queries","summary":" We consider the problem of contextual bandits and imitation learning, where\nthe learner lacks direct knowledge of the executed action's reward. Instead,\nthe learner can actively query an expert at each round to compare two actions\nand receive noisy preference feedback. The learner's objective is two-fold: to\nminimize the regret associated with the executed actions, while simultaneously,\nminimizing the number of comparison queries made to the expert. In this paper,\nwe assume that the learner has access to a function class that can represent\nthe expert's preference model under appropriate link functions, and provide an\nalgorithm that leverages an online regression oracle with respect to this\nfunction class for choosing its actions and deciding when to query. For the\ncontextual bandit setting, our algorithm achieves a regret bound that combines\nthe best of both worlds, scaling as $O(\\min\\{\\sqrt{T}, d/\\Delta\\})$, where $T$\nrepresents the number of interactions, $d$ represents the eluder dimension of\nthe function class, and $\\Delta$ represents the minimum preference of the\noptimal action over any suboptimal action under all contexts. Our algorithm\ndoes not require the knowledge of $\\Delta$, and the obtained regret bound is\ncomparable to what can be achieved in the standard contextual bandits setting\nwhere the learner observes reward signals at each round. Additionally, our\nalgorithm makes only $O(\\min\\{T, d^2/\\Delta^2\\})$ queries to the expert. We\nthen extend our algorithm to the imitation learning setting, where the learning\nagent engages with an unknown environment in episodes of length $H$ each, and\nprovide similar guarantees for regret and query complexity. Interestingly, our\nalgorithm for imitation learning can even learn to outperform the underlying\nexpert, when it is suboptimal, highlighting a practical benefit of\npreference-based feedback in imitation learning.\n","authors":["Ayush Sekhari","Karthik Sridharan","Wen Sun","Runzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2307.12926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12231v2","updated":"2023-07-24T16:00:37Z","published":"2023-04-24T16:18:22Z","title":"An Approximation Theory for Metric Space-Valued Functions With A View\n Towards Deep Learning","summary":" Motivated by the developing mathematics of deep learning, we build universal\nfunctions approximators of continuous maps between arbitrary Polish metric\nspaces $\\mathcal{X}$ and $\\mathcal{Y}$ using elementary functions between\nEuclidean spaces as building blocks. Earlier results assume that the target\nspace $\\mathcal{Y}$ is a topological vector space. We overcome this limitation\nby ``randomization'': our approximators output discrete probability measures\nover $\\mathcal{Y}$. 
When $\\mathcal{X}$ and $\\mathcal{Y}$ are Polish without\nadditional structure, we prove very general qualitative guarantees; when they\nhave suitable combinatorial structure, we prove quantitative guarantees for\nH\\\"{o}lder-like maps, including maps between finite graphs, solution operators\nto rough differential equations between certain Carnot groups, and continuous\nnon-linear operators between Banach spaces arising in inverse problems. In\nparticular, we show that the required number of Dirac measures is determined by\nthe combinatorial structure of $\\mathcal{X}$ and $\\mathcal{Y}$. For barycentric\n$\\mathcal{Y}$, including Banach spaces, $\\mathbb{R}$-trees, Hadamard manifolds,\nor Wasserstein spaces on Polish metric spaces, our approximators reduce to\n$\\mathcal{Y}$-valued functions. When the Euclidean approximators are neural\nnetworks, our constructions generalize transformer networks, providing a new\nprobabilistic viewpoint of geometric deep learning.\n","authors":["Anastasis Kratsios","Chong Liu","Matti Lassas","Maarten V. de Hoop","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2304.12231v2.pdf","comment":"14 Figures, 3 Tables, 78 Pages (Main 40, Proofs 26, Acknowledgments\n and References 12)"},{"id":"http://arxiv.org/abs/2307.12906v1","updated":"2023-07-24T15:59:36Z","published":"2023-07-24T15:59:36Z","title":"QAmplifyNet: Pushing the Boundaries of Supply Chain Backorder Prediction\n Using Interpretable Hybrid Quantum - Classical Neural Network","summary":" Supply chain management relies on accurate backorder prediction for\noptimizing inventory control, reducing costs, and enhancing customer\nsatisfaction. However, traditional machine-learning models struggle with\nlarge-scale datasets and complex relationships, hindering real-world data\ncollection. This research introduces a novel methodological framework for\nsupply chain backorder prediction, addressing the challenge of handling large\ndatasets. Our proposed model, QAmplifyNet, employs quantum-inspired techniques\nwithin a quantum-classical neural network to predict backorders effectively on\nshort and imbalanced datasets. Experimental evaluations on a benchmark dataset\ndemonstrate QAmplifyNet's superiority over classical models, quantum ensembles,\nquantum neural networks, and deep reinforcement learning. Its proficiency in\nhandling short, imbalanced datasets makes it an ideal solution for supply chain\nmanagement. To enhance model interpretability, we use Explainable Artificial\nIntelligence techniques. Practical implications include improved inventory\ncontrol, reduced backorders, and enhanced operational efficiency. QAmplifyNet\nseamlessly integrates into real-world supply chain management systems, enabling\nproactive decision-making and efficient resource allocation. Future work\ninvolves exploring additional quantum-inspired techniques, expanding the\ndataset, and investigating other supply chain applications. This research\nunlocks the potential of quantum computing in supply chain optimization and\npaves the way for further exploration of quantum-inspired machine learning\nmodels in supply chain management. Our framework and QAmplifyNet model offer a\nbreakthrough approach to supply chain backorder prediction, providing superior\nperformance and opening new avenues for leveraging quantum-inspired techniques\nin supply chain management.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","Md. Saiful Islam","Jungpil Shin","M. F. 
Mridha","Yuichi Okuyama"],"pdf_url":"https://arxiv.org/pdf/2307.12906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12904v1","updated":"2023-07-24T15:52:33Z","published":"2023-07-24T15:52:33Z","title":"Universal Approximation Theorem and error bounds for quantum neural\n networks and quantum reservoirs","summary":" Universal approximation theorems are the foundations of classical neural\nnetworks, providing theoretical guarantees that the latter are able to\napproximate maps of interest. Recent results have shown that this can also be\nachieved in a quantum setting, whereby classical functions can be approximated\nby parameterised quantum circuits. We provide here precise error bounds for\nspecific classes of functions and extend these results to the interesting new\nsetup of randomised quantum circuits, mimicking classical reservoir neural\nnetworks. Our results show in particular that a quantum neural network with\n$\\mathcal{O}(\\varepsilon^{-2})$ weights and $\\mathcal{O} (\\lceil\n\\log_2(\\varepsilon^{-1}) \\rceil)$ qubits suffices to achieve accuracy\n$\\varepsilon>0$ when approximating functions with integrable Fourier transform.\n","authors":["Lukas Gonon","Antoine Jacquier"],"pdf_url":"https://arxiv.org/pdf/2307.12904v1.pdf","comment":"20 pages, 0 figure"},{"id":"http://arxiv.org/abs/2206.02909v2","updated":"2023-07-24T15:47:59Z","published":"2022-06-06T21:14:01Z","title":"Self-supervised Learning for Human Activity Recognition Using 700,000\n Person-days of Wearable Data","summary":" Advances in deep learning for human activity recognition have been relatively\nlimited due to the lack of large labelled datasets. In this study, we leverage\nself-supervised learning techniques on the UK-Biobank activity tracker\ndataset--the largest of its kind to date--containing more than 700,000\nperson-days of unlabelled wearable sensor data. Our resulting activity\nrecognition model consistently outperformed strong baselines across seven\nbenchmark datasets, with an F1 relative improvement of 2.5%-100% (median\n18.4%), the largest improvements occurring in the smaller datasets. In contrast\nto previous studies, our results generalise across external datasets, devices,\nand environments. Our open-source model will help researchers and developers to\nbuild customisable and generalisable activity classifiers with high\nperformance.\n","authors":["Hang Yuan","Shing Chan","Andrew P. Creagh","Catherine Tong","David A. Clifton","Aiden Doherty"],"pdf_url":"https://arxiv.org/pdf/2206.02909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12897v1","updated":"2023-07-24T15:44:30Z","published":"2023-07-24T15:44:30Z","title":"Anytime Model Selection in Linear Bandits","summary":" Model selection in the context of bandit optimization is a challenging\nproblem, as it requires balancing exploration and exploitation not only for\naction selection, but also for model selection. One natural approach is to rely\non online learning algorithms that treat different models as experts. Existing\nmethods, however, scale poorly ($\\text{poly}M$) with the number of models $M$\nin terms of their regret. Our key insight is that, for model selection in\nlinear bandits, we can emulate full-information feedback to the online learner\nwith a favorable bias-variance trade-off. This allows us to develop ALEXP,\nwhich has an exponentially improved ($\\log M$) dependence on $M$ for its\nregret. 
ALEXP has anytime guarantees on its regret, and neither requires\nknowledge of the horizon $n$, nor relies on an initial purely exploratory\nstage. Our approach utilizes a novel time-uniform analysis of the Lasso,\nestablishing a new connection between online learning and high-dimensional\nstatistics.\n","authors":["Parnian Kassraie","Aldo Pacchiano","Nicolas Emmenegger","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2307.12897v1.pdf","comment":"37 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.12892v1","updated":"2023-07-24T15:42:33Z","published":"2023-07-24T15:42:33Z","title":"A Statistical View of Column Subset Selection","summary":" We consider the problem of selecting a small subset of representative\nvariables from a large dataset. In the computer science literature, this\ndimensionality reduction problem is typically formalized as Column Subset\nSelection (CSS). Meanwhile, the typical statistical formalization is to find an\ninformation-maximizing set of Principal Variables. This paper shows that these\ntwo approaches are equivalent, and moreover, both can be viewed as maximum\nlikelihood estimation within a certain semi-parametric model. Using these\nconnections, we show how to efficiently (1) perform CSS using only summary\nstatistics from the original dataset; (2) perform CSS in the presence of\nmissing and/or censored data; and (3) select the subset size for CSS in a\nhypothesis testing framework.\n","authors":["Anav Sood","Trevor Hastie"],"pdf_url":"https://arxiv.org/pdf/2307.12892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08649v3","updated":"2023-07-24T15:33:25Z","published":"2023-04-17T22:53:54Z","title":"Classification of US Supreme Court Cases using BERT-Based Techniques","summary":" Models based on bidirectional encoder representations from transformers\n(BERT) produce state of the art (SOTA) results on many natural language\nprocessing (NLP) tasks such as named entity recognition (NER), part-of-speech\n(POS) tagging etc. An interesting phenomenon occurs when classifying long\ndocuments such as those from the US supreme court where BERT-based models can\nbe considered difficult to use on a first-pass or out-of-the-box basis. In this\npaper, we experiment with several BERT-based classification techniques for US\nsupreme court decisions or supreme court database (SCDB) and compare them with\nthe previous SOTA results. We then compare our results specifically with SOTA\nmodels for long documents. We compare our results for two classification tasks:\n(1) a broad classification task with 15 categories and (2) a fine-grained\nclassification task with 279 categories. Our best result produces an accuracy\nof 80\\% on the 15 broad categories and 60\\% on the fine-grained 279 categories\nwhich marks an improvement of 8\\% and 28\\% respectively from previously\nreported SOTA results.\n","authors":["Shubham Vatsal","Adam Meyers","John E. Ortega"],"pdf_url":"https://arxiv.org/pdf/2304.08649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.13628v2","updated":"2023-07-24T15:31:05Z","published":"2021-08-31T05:38:36Z","title":"Learning Optimal Prescriptive Trees from Observational Data","summary":" We consider the problem of learning an optimal prescriptive tree (i.e., an\ninterpretable treatment assignment policy in the form of a binary tree) of\nmoderate depth, from observational data. 
This problem arises in numerous\nsocially important domains such as public health and personalized medicine,\nwhere interpretable and data-driven interventions are sought based on data\ngathered in deployment -- through passive collection of data -- rather than\nfrom randomized trials. We propose a method for learning optimal prescriptive\ntrees using mixed-integer optimization (MIO) technology. We show that under\nmild conditions our method is asymptotically exact in the sense that it\nconverges to an optimal out-of-sample treatment assignment policy as the number\nof historical data samples tends to infinity. Contrary to existing literature,\nour approach: 1) does not require data to be randomized, 2) does not impose\nstringent assumptions on the learned trees, and 3) has the ability to model\ndomain specific constraints. Through extensive computational experiments, we\ndemonstrate that our asymptotic guarantees translate to significant performance\nimprovements in finite samples, as well as showcase our uniquely flexible\nmodeling power by incorporating budget and fairness constraints.\n","authors":["Nathanael Jo","Sina Aghaei","Andrés Gómez","Phebe Vayanos"],"pdf_url":"https://arxiv.org/pdf/2108.13628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11389v3","updated":"2023-07-24T15:28:34Z","published":"2022-08-24T09:26:12Z","title":"Approximate blocked Gibbs sampling for Bayesian neural networks","summary":" In this work, minibatch MCMC sampling for feedforward neural networks is made\nmore feasible. To this end, it is proposed to sample subgroups of parameters\nvia a blocked Gibbs sampling scheme. By partitioning the parameter space,\nsampling is possible irrespective of layer width. It is also possible to\nalleviate vanishing acceptance rates for increasing depth by reducing the\nproposal variance in deeper layers. Increasing the length of a non-convergent\nchain increases the predictive accuracy in classification tasks, so avoiding\nvanishing acceptance rates and consequently enabling longer chain runs have\npractical benefits. Moreover, non-convergent chain realizations aid in the\nquantification of predictive uncertainty. An open problem is how to perform\nminibatch MCMC sampling for feedforward neural networks in the presence of\naugmented data.\n","authors":["Theodore Papamarkou"],"pdf_url":"https://arxiv.org/pdf/2208.11389v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12803v3","updated":"2023-07-24T15:27:16Z","published":"2022-01-30T12:53:51Z","title":"Generalizing similarity in noisy setups: the DIBS phenomenon","summary":" This work uncovers an interplay among data density, noise, and the\ngeneralization ability in similarity learning. We consider Siamese Neural\nNetworks (SNNs), which are the basic form of contrastive learning, and explore\ntwo types of noise that can impact SNNs, Pair Label Noise (PLN) and Single\nLabel Noise (SLN). Our investigation reveals that SNNs exhibit double descent\nbehaviour regardless of the training setup and that it is further exacerbated\nby noise. We demonstrate that the density of data pairs is crucial for\ngeneralization. When SNNs are trained on sparse datasets with the same amount\nof PLN or SLN, they exhibit comparable generalization properties. However, when\nusing dense datasets, PLN cases generalize worse than SLN ones in the\noverparametrized region, leading to a phenomenon we call Density-Induced Break\nof Similarity (DIBS). 
In this regime, PLN similarity violation becomes\nmacroscopical, corrupting the dataset to the point where complete interpolation\ncannot be achieved, regardless of the number of model parameters. Our analysis\nalso delves into the correspondence between online optimization and offline\ngeneralization in similarity learning. The results show that this equivalence\nfails in the presence of label noise in all the scenarios considered.\n","authors":["Nayara Fonseca","Veronica Guidetti"],"pdf_url":"https://arxiv.org/pdf/2201.12803v3.pdf","comment":"v3: version accepted at ECAI 2023 + Supplementary Material"},{"id":"http://arxiv.org/abs/2307.10490v3","updated":"2023-07-24T15:24:17Z","published":"2023-07-19T23:03:20Z","title":"(Ab)using Images and Sounds for Indirect Instruction Injection in\n Multi-Modal LLMs","summary":" We demonstrate how images and sounds can be used for indirect prompt and\ninstruction injection in multi-modal LLMs. An attacker generates an adversarial\nperturbation corresponding to the prompt and blends it into an image or audio\nrecording. When the user asks the (unmodified, benign) model about the\nperturbed image or audio, the perturbation steers the model to output the\nattacker-chosen text and/or make the subsequent dialog follow the attacker's\ninstruction. We illustrate this attack with several proof-of-concept examples\ntargeting LLaVa and PandaGPT.\n","authors":["Eugene Bagdasaryan","Tsung-Yin Hsieh","Ben Nassi","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2307.10490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10891v2","updated":"2023-07-24T15:16:46Z","published":"2023-06-19T12:36:54Z","title":"Transformer Training Strategies for Forecasting Multiple Load Time\n Series","summary":" In the smart grid of the future, accurate load forecasts on the level of\nindividual clients can help to balance supply and demand locally and to prevent\ngrid outages. While the number of monitored clients will increase with the\nongoing smart meter rollout, the amount of data per client will always be\nlimited. We evaluate whether a Transformer load forecasting model benefits from\na transfer learning strategy, where a global univariate model is trained on the\nload time series from multiple clients. In experiments with two datasets\ncontaining load time series from several hundred clients, we find that the\nglobal training strategy is superior to the multivariate and local training\nstrategies used in related work. On average, the global training strategy\nresults in 21.8% and 12.8% lower forecasting errors than the two other\nstrategies, measured across forecasting horizons from one day to one month into\nthe future. A comparison to linear models, multi-layer perceptrons and LSTMs\nshows that Transformers are effective for load forecasting when they are\ntrained with the global training strategy.\n","authors":["Matthias Hertel","Maximilian Beichter","Benedikt Heidrich","Oliver Neumann","Benjamin Schäfer","Ralf Mikut","Veit Hagenmeyer"],"pdf_url":"https://arxiv.org/pdf/2306.10891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12872v1","updated":"2023-07-24T15:10:22Z","published":"2023-07-24T15:10:22Z","title":"Data-free Black-box Attack based on Diffusion Model","summary":" Since the training data for the target model in a data-free black-box attack\nis not available, most recent schemes utilize GANs to generate data for\ntraining substitute model. 
However, these GANs-based schemes suffer from low\ntraining efficiency as the generator needs to be retrained for each target\nmodel during the substitute training process, as well as low generation\nquality. To overcome these limitations, we consider utilizing the diffusion\nmodel to generate data, and propose a data-free black-box attack scheme based\non diffusion model to improve the efficiency and accuracy of substitute\ntraining. Despite the data generated by the diffusion model exhibits high\nquality, it presents diverse domain distributions and contains many samples\nthat do not meet the discriminative criteria of the target model. To further\nfacilitate the diffusion model to generate data suitable for the target model,\nwe propose a Latent Code Augmentation (LCA) method to guide the diffusion model\nin generating data. With the guidance of LCA, the data generated by the\ndiffusion model not only meets the discriminative criteria of the target model\nbut also exhibits high diversity. By utilizing this data, it is possible to\ntrain substitute model that closely resemble the target model more efficiently.\nExtensive experiments demonstrate that our LCA achieves higher attack success\nrates and requires fewer query budgets compared to GANs-based schemes for\ndifferent target models.\n","authors":["Mingwen Shao","Lingzhuang Meng","Yuanjian Qiao","Lixu Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.12872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12862v1","updated":"2023-07-24T15:02:03Z","published":"2023-07-24T15:02:03Z","title":"Stochastic Step-wise Feature Selection for Exponential Random Graph\n Models (ERGMs)","summary":" Statistical analysis of social networks provides valuable insights into\ncomplex network interactions across various scientific disciplines. However,\naccurate modeling of networks remains challenging due to the heavy\ncomputational burden and the need to account for observed network dependencies.\nExponential Random Graph Models (ERGMs) have emerged as a promising technique\nused in social network modeling to capture network dependencies by\nincorporating endogenous variables. Nevertheless, using ERGMs poses multiple\nchallenges, including the occurrence of ERGM degeneracy, which generates\nunrealistic and meaningless network structures. To address these challenges and\nenhance the modeling of collaboration networks, we propose and test a novel\napproach that focuses on endogenous variable selection within ERGMs. Our method\naims to overcome the computational burden and improve the accommodation of\nobserved network dependencies, thereby facilitating more accurate and\nmeaningful interpretations of network phenomena in various scientific fields.\nWe conduct empirical testing and rigorous analysis to contribute to the\nadvancement of statistical techniques and offer practical insights for network\nanalysis.\n","authors":["Helal El-Zaatari","Fei Yu","Michael R Kosorok"],"pdf_url":"https://arxiv.org/pdf/2307.12862v1.pdf","comment":"23 pages, 6 tables and 18 figures"},{"id":"http://arxiv.org/abs/2307.12856v1","updated":"2023-07-24T14:56:30Z","published":"2023-07-24T14:56:30Z","title":"A Real-World WebAgent with Planning, Long Context Understanding, and\n Program Synthesis","summary":" Pre-trained large language models (LLMs) have recently achieved better\ngeneralization and sample efficiency in autonomous web navigation. 
However, the\nperformance on real-world websites has still suffered from (1) open domainness,\n(2) limited context length, and (3) lack of inductive bias on HTML. We\nintroduce WebAgent, an LLM-driven agent that can complete the tasks on real\nwebsites following natural language instructions. WebAgent plans ahead by\ndecomposing instructions into canonical sub-instructions, summarizes long HTML\ndocuments into task-relevant snippets, and acts on websites via generated\nPython programs from those. We design WebAgent with Flan-U-PaLM, for grounded\ncode generation, and HTML-T5, new pre-trained LLMs for long HTML documents\nusing local and global attention mechanisms and a mixture of long-span\ndenoising objectives, for planning and summarization. We empirically\ndemonstrate that our recipe improves the success on a real website by over 50%,\nand that HTML-T5 is the best model to solve HTML-based tasks; achieving 14.9%\nhigher success rate than prior SoTA on the MiniWoB web navigation benchmark and\nbetter accuracy on offline task planning evaluation.\n","authors":["Izzeddin Gur","Hiroki Furuta","Austin Huang","Mustafa Safdari","Yutaka Matsuo","Douglas Eck","Aleksandra Faust"],"pdf_url":"https://arxiv.org/pdf/2307.12856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12851v1","updated":"2023-07-24T14:51:54Z","published":"2023-07-24T14:51:54Z","title":"Early Neuron Alignment in Two-layer ReLU Networks with Small\n Initialization","summary":" This paper studies the problem of training a two-layer ReLU network for\nbinary classification using gradient flow with small initialization. We\nconsider a training dataset with well-separated input vectors: Any pair of\ninput data with the same label are positively correlated, and any pair with\ndifferent labels are negatively correlated. Our analysis shows that, during the\nearly phase of training, neurons in the first layer try to align with either\nthe positive data or the negative data, depending on its corresponding weight\non the second layer. A careful analysis of the neurons' directional dynamics\nallows us to provide an $\\mathcal{O}(\\frac{\\log n}{\\sqrt{\\mu}})$ upper bound on\nthe time it takes for all neurons to achieve good alignment with the input\ndata, where $n$ is the number of data points and $\\mu$ measures how well the\ndata are separated. After the early alignment phase, the loss converges to zero\nat a $\\mathcal{O}(\\frac{1}{t})$ rate, and the weight matrix on the first layer\nis approximately low-rank. Numerical experiments on the MNIST dataset\nillustrate our theoretical findings.\n","authors":["Hancheng Min","René Vidal","Enrique Mallada"],"pdf_url":"https://arxiv.org/pdf/2307.12851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12840v1","updated":"2023-07-24T14:37:22Z","published":"2023-07-24T14:37:22Z","title":"Efficiently Learning One-Hidden-Layer ReLU Networks via Schur\n Polynomials","summary":" We study the problem of PAC learning a linear combination of $k$ ReLU\nactivations under the standard Gaussian distribution on $\\mathbb{R}^d$ with\nrespect to the square loss. Our main result is an efficient algorithm for this\nlearning task with sample and computational complexity $(dk/\\epsilon)^{O(k)}$,\nwhere $\\epsilon>0$ is the target accuracy. Prior work had given an algorithm\nfor this problem with complexity $(dk/\\epsilon)^{h(k)}$, where the function\n$h(k)$ scales super-polynomially in $k$. 
Interestingly, the complexity of our\nalgorithm is near-optimal within the class of Correlational Statistical Query\nalgorithms. At a high-level, our algorithm uses tensor decomposition to\nidentify a subspace such that all the $O(k)$-order moments are small in the\northogonal directions. Its analysis makes essential use of the theory of Schur\npolynomials to show that the higher-moment error tensors are small given that\nthe lower-order ones are.\n","authors":["Ilias Diakonikolas","Daniel M. Kane"],"pdf_url":"https://arxiv.org/pdf/2307.12840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08272v3","updated":"2023-07-24T14:28:11Z","published":"2023-03-14T23:26:55Z","title":"Automated patent extraction powers generative modeling in focused\n chemical spaces","summary":" Deep generative models have emerged as an exciting avenue for inverse\nmolecular design, with progress coming from the interplay between training\nalgorithms and molecular representations. One of the key challenges in their\napplicability to materials science and chemistry has been the lack of access to\nsizeable training datasets with property labels. Published patents contain the\nfirst disclosure of new materials prior to their publication in journals, and\nare a vast source of scientific knowledge that has remained relatively untapped\nin the field of data-driven molecular design. Because patents are filed seeking\nto protect specific uses, molecules in patents can be considered to be weakly\nlabeled into application classes. Furthermore, patents published by the US\nPatent and Trademark Office (USPTO) are downloadable and have machine-readable\ntext and molecular structures. In this work, we train domain-specific\ngenerative models using patent data sources by developing an automated pipeline\nto go from USPTO patent digital files to the generation of novel candidates\nwith minimal human intervention. We test the approach on two in-class extracted\ndatasets, one in organic electronics and another in tyrosine kinase inhibitors.\nWe then evaluate the ability of generative models trained on these in-class\ndatasets on two categories of tasks (distribution learning and property\noptimization), identify strengths and limitations, and suggest possible\nexplanations and remedies that could be used to overcome these in practice.\n","authors":["Akshay Subramanian","Kevin P. Greenman","Alexis Gervaix","Tzuhsiung Yang","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2303.08272v3.pdf","comment":"Digital Discovery (2023)"},{"id":"http://arxiv.org/abs/2307.02620v2","updated":"2023-07-24T14:21:09Z","published":"2023-07-05T19:48:03Z","title":"Learning when to observe: A frugal reinforcement learning framework for\n a high-cost world","summary":" Reinforcement learning (RL) has been shown to learn sophisticated control\npolicies for complex tasks including games, robotics, heating and cooling\nsystems and text generation. The action-perception cycle in RL, however,\ngenerally assumes that a measurement of the state of the environment is\navailable at each time step without a cost. In applications such as materials\ndesign, deep-sea and planetary robot exploration and medicine, however, there\ncan be a high cost associated with measuring, or even approximating, the state\nof the environment. In this paper, we survey the recently growing literature\nthat adopts the perspective that an RL agent might not need, or even want, a\ncostly measurement at each time step. 
Within this context, we propose the Deep\nDynamic Multi-Step Observationless Agent (DMSOA), contrast it with the\nliterature and empirically evaluate it on OpenAI gym and Atari Pong\nenvironments. Our results, show that DMSOA learns a better policy with fewer\ndecision steps and measurements than the considered alternative from the\nliterature. The corresponding code is available at:\n\\url{https://github.com/cbellinger27/Learning-when-to-observe-in-RL\n","authors":["Colin Bellinger","Mark Crowley","Isaac Tamblyn"],"pdf_url":"https://arxiv.org/pdf/2307.02620v2.pdf","comment":"Accepted for presentation at ECML-PKDD 2023 workshop track:\n Simplification, Compression, Efficiency and Frugality for Artificial\n Intelligence (SCEFA)"},{"id":"http://arxiv.org/abs/2307.12822v1","updated":"2023-07-24T14:19:36Z","published":"2023-07-24T14:19:36Z","title":"Learning Provably Robust Estimators for Inverse Problems via Jittering","summary":" Deep neural networks provide excellent performance for inverse problems such\nas denoising. However, neural networks can be sensitive to adversarial or\nworst-case perturbations. This raises the question of whether such networks can\nbe trained efficiently to be worst-case robust. In this paper, we investigate\nwhether jittering, a simple regularization technique that adds isotropic\nGaussian noise during training, is effective for learning worst-case robust\nestimators for inverse problems. While well studied for prediction in\nclassification tasks, the effectiveness of jittering for inverse problems has\nnot been systematically investigated. In this paper, we present a novel\nanalytical characterization of the optimal $\\ell_2$-worst-case robust estimator\nfor linear denoising and show that jittering yields optimal robust denoisers.\nFurthermore, we examine jittering empirically via training deep neural networks\n(U-nets) for natural image denoising, deconvolution, and accelerated magnetic\nresonance imaging (MRI). The results show that jittering significantly enhances\nthe worst-case robustness, but can be suboptimal for inverse problems beyond\ndenoising. Moreover, our results imply that training on real data which often\ncontains slight noise is somewhat robustness enhancing.\n","authors":["Anselm Krainovic","Mahdi Soltanolkotabi","Reinhard Heckel"],"pdf_url":"https://arxiv.org/pdf/2307.12822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02813v2","updated":"2023-07-24T14:17:24Z","published":"2023-07-06T07:18:22Z","title":"CPDG: A Contrastive Pre-Training Method for Dynamic Graph Neural\n Networks","summary":" Dynamic graph data mining has gained popularity in recent years due to the\nrich information contained in dynamic graphs and their widespread use in the\nreal world. Despite the advances in dynamic graph neural networks (DGNNs), the\nrich information and diverse downstream tasks have posed significant\ndifficulties for the practical application of DGNNs in industrial scenarios. To\nthis end, in this paper, we propose to address them by pre-training and present\nthe Contrastive Pre-Training Method for Dynamic Graph Neural Networks (CPDG).\nCPDG tackles the challenges of pre-training for DGNNs, including generalization\ncapability and long-short term modeling capability, through a flexible\nstructural-temporal subgraph sampler along with structural-temporal contrastive\npre-training schemes. 
Extensive experiments conducted on both large-scale\nresearch and industrial dynamic graph datasets show that CPDG outperforms\nexisting methods in dynamic graph pre-training for various downstream tasks\nunder three transfer settings.\n","authors":["Yuanchen Bei","Hao Xu","Sheng Zhou","Huixuan Chi","Haishuai Wang","Mengdi Zhang","Zhao Li","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2307.02813v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12797v1","updated":"2023-07-24T13:46:50Z","published":"2023-07-24T13:46:50Z","title":"Causal Fair Machine Learning via Rank-Preserving Interventional\n Distributions","summary":" A decision can be defined as fair if equal individuals are treated equally\nand unequals unequally. Adopting this definition, the task of designing machine\nlearning models that mitigate unfairness in automated decision-making systems\nmust include causal thinking when introducing protected attributes. Following a\nrecent proposal, we define individuals as being normatively equal if they are\nequal in a fictitious, normatively desired (FiND) world, where the protected\nattribute has no (direct or indirect) causal effect on the target. We propose\nrank-preserving interventional distributions to define an estimand of this FiND\nworld and a warping method for estimation. Evaluation criteria for both the\nmethod and resulting model are presented and validated through simulations and\nempirical data. With this, we show that our warping approach effectively\nidentifies the most discriminated individuals and mitigates unfairness.\n","authors":["Ludwig Bothmann","Susanne Dandl","Michael Schomaker"],"pdf_url":"https://arxiv.org/pdf/2307.12797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.05018v3","updated":"2023-07-24T13:46:46Z","published":"2022-07-11T17:13:10Z","title":"Learning Temporally Extended Skills in Continuous Domains as Symbolic\n Actions for Planning","summary":" Problems which require both long-horizon planning and continuous control\ncapabilities pose significant challenges to existing reinforcement learning\nagents. In this paper we introduce a novel hierarchical reinforcement learning\nagent which links temporally extended skills for continuous control with a\nforward model in a symbolic discrete abstraction of the environment's state for\nplanning. We term our agent SEADS for Symbolic Effect-Aware Diverse Skills. We\nformulate an objective and corresponding algorithm which leads to unsupervised\nlearning of a diverse set of skills through intrinsic motivation given a known\nstate abstraction. The skills are jointly learned with the symbolic forward\nmodel which captures the effect of skill execution in the state abstraction.\nAfter training, we can leverage the skills as symbolic actions using the\nforward model for long-horizon planning and subsequently execute the plan using\nthe learned continuous-action control skills. The proposed algorithm learns\nskills and forward models that can be used to solve complex tasks which require\nboth continuous control and long-horizon planning capabilities with high\nsuccess rate. It compares favorably with other flat and hierarchical\nreinforcement learning baseline agents and is successfully demonstrated with a\nreal robot.\n","authors":["Jan Achterhold","Markus Krimmel","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2207.05018v3.pdf","comment":"Project website (including video) is available at\n https://seads.is.tue.mpg.de/. 
(v2) Accepted for publication at the 6th\n Conference on Robot Learning (CoRL) 2022, Auckland, New Zealand. (v3) Added\n details on checkpointing (S.8.1), with references on p.7, p.8, p.21 to\n clarify number of env. steps of reported results"},{"id":"http://arxiv.org/abs/2307.12790v1","updated":"2023-07-24T13:39:21Z","published":"2023-07-24T13:39:21Z","title":"Compact & Capable: Harnessing Graph Neural Networks and Edge Convolution\n for Medical Image Classification","summary":" Graph-based neural network models are gaining traction in the field of\nrepresentation learning due to their ability to uncover latent topological\nrelationships between entities that are otherwise challenging to identify.\nThese models have been employed across a diverse range of domains, encompassing\ndrug discovery, protein interactions, semantic segmentation, and fluid dynamics\nresearch. In this study, we investigate the potential of Graph Neural Networks\n(GNNs) for medical image classification. We introduce a novel model that\ncombines GNNs and edge convolution, leveraging the interconnectedness of RGB\nchannel feature values to strongly represent connections between crucial graph\nnodes. Our proposed model not only performs on par with state-of-the-art Deep\nNeural Networks (DNNs) but does so with 1000 times fewer parameters, resulting\nin reduced training time and data requirements. We compare our Graph\nConvolutional Neural Network (GCNN) to pre-trained DNNs for classifying\nMedMNIST dataset classes, revealing promising prospects for GNNs in medical\nimage analysis. Our results also encourage further exploration of advanced\ngraph-based models such as Graph Attention Networks (GAT) and Graph\nAuto-Encoders in the medical imaging domain. The proposed model yields more\nreliable, interpretable, and accurate outcomes for tasks like semantic\nsegmentation and image classification compared to simpler GCNNs\n","authors":["Aryan Singh","Pepijn Van de Ven","Ciarán Eising","Patrick Denny"],"pdf_url":"https://arxiv.org/pdf/2307.12790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13170v4","updated":"2023-07-24T13:35:28Z","published":"2022-04-27T20:04:24Z","title":"AdaBest: Minimizing Client Drift in Federated Learning via Adaptive Bias\n Estimation","summary":" In Federated Learning (FL), a number of clients or devices collaborate to\ntrain a model without sharing their data. Models are optimized locally at each\nclient and further communicated to a central hub for aggregation. While FL is\nan appealing decentralized training paradigm, heterogeneity among data from\ndifferent clients can cause the local optimization to drift away from the\nglobal objective. In order to estimate and therefore remove this drift,\nvariance reduction techniques have been incorporated into FL optimization\nrecently. However, these approaches inaccurately estimate the clients' drift\nand ultimately fail to remove it properly. In this work, we propose an adaptive\nalgorithm that accurately estimates drift across clients. In comparison to\nprevious works, our approach necessitates less storage and communication\nbandwidth, as well as lower compute costs. Additionally, our proposed\nmethodology induces stability by constraining the norm of estimates for client\ndrift, making it more practical for large scale FL. 
Experimental findings\ndemonstrate that the proposed algorithm converges significantly faster and\nachieves higher accuracy than the baselines across various FL benchmarks.\n","authors":["Farshid Varno","Marzie Saghayi","Laya Rafiee Sevyeri","Sharut Gupta","Stan Matwin","Mohammad Havaei"],"pdf_url":"https://arxiv.org/pdf/2204.13170v4.pdf","comment":"Published as a conference paper at ECCV 2022; Corrected some typos in\n the text and a baseline algorithm"},{"id":"http://arxiv.org/abs/2307.12788v1","updated":"2023-07-24T13:35:18Z","published":"2023-07-24T13:35:18Z","title":"Analyzing the Strategy of Propaganda using Inverse Reinforcement\n Learning: Evidence from the 2022 Russian Invasion of Ukraine","summary":" The 2022 Russian invasion of Ukraine was accompanied by a large-scale,\npro-Russian propaganda campaign on social media. However, the strategy behind\nthe dissemination of propaganda has remained unclear, particularly how the\nonline discourse was strategically shaped by the propagandists' community.\nHere, we analyze the strategy of the Twitter community using an inverse\nreinforcement learning (IRL) approach. Specifically, IRL allows us to model\nonline behavior as a Markov decision process, where the goal is to infer the\nunderlying reward structure that guides propagandists when interacting with\nusers with a supporting or opposing stance toward the invasion. Thereby, we aim\nto understand empirically whether and how between-user interactions are\nstrategically used to promote the proliferation of Russian propaganda. For\nthis, we leverage a large-scale dataset with 349,455 posts with pro-Russian\npropaganda from 132,131 users. We show that bots and humans follow a different\nstrategy: bots respond predominantly to pro-invasion messages, suggesting that\nthey seek to drive virality; while messages indicating opposition primarily\nelicit responses from humans, suggesting that they tend to engage in critical\ndiscussions. To the best of our knowledge, this is the first study analyzing\nthe strategy behind propaganda from the 2022 Russian invasion of Ukraine\nthrough the lens of IRL.\n","authors":["Dominique Geissler","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2307.12788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12540v2","updated":"2023-07-24T13:35:16Z","published":"2023-03-22T13:16:37Z","title":"Deployment of Image Analysis Algorithms under Prevalence Shifts","summary":" Domain gaps are among the most relevant roadblocks in the clinical\ntranslation of machine learning (ML)-based solutions for medical image\nanalysis. While current research focuses on new training paradigms and network\narchitectures, little attention is given to the specific effect of prevalence\nshifts on an algorithm deployed in practice. Such discrepancies between class\nfrequencies in the data used for a method's development/validation and that in\nits deployment environment(s) are of great importance, for example in the\ncontext of artificial intelligence (AI) democratization, as disease prevalences\nmay vary widely across time and location. Our contribution is twofold. First,\nwe empirically demonstrate the potentially severe consequences of missing\nprevalence handling by analyzing (i) the extent of miscalibration, (ii) the\ndeviation of the decision threshold from the optimum, and (iii) the ability of\nvalidation metrics to reflect neural network performance on the deployment\npopulation as a function of the discrepancy between development and deployment\nprevalence. 
Second, we propose a workflow for prevalence-aware image\nclassification that uses estimated deployment prevalences to adjust a trained\nclassifier to a new environment, without requiring additional annotated\ndeployment data. Comprehensive experiments based on a diverse set of 30 medical\nclassification tasks showcase the benefit of the proposed workflow in\ngenerating better classifier decisions and more reliable performance estimates\ncompared to current practice.\n","authors":["Patrick Godau","Piotr Kalinowski","Evangelia Christodoulou","Annika Reinke","Minu Tizabi","Luciana Ferrer","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.12540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12775v1","updated":"2023-07-24T13:24:56Z","published":"2023-07-24T13:24:56Z","title":"Is attention all you need in medical image analysis? A review","summary":" Medical imaging is a key component in clinical diagnosis, treatment planning\nand clinical trial design, accounting for almost 90% of all healthcare data.\nCNNs achieved performance gains in medical image analysis (MIA) over the last\nyears. CNNs can efficiently model local pixel interactions and be trained on\nsmall-scale MI data. The main disadvantage of typical CNN models is that they\nignore global pixel relationships within images, which limits their\ngeneralisation ability to understand out-of-distribution data with different\n'global' information. The recent progress of Artificial Intelligence gave rise\nto Transformers, which can learn global relationships from data. However, full\nTransformer models need to be trained on large-scale data and involve\ntremendous computational complexity. Attention and Transformer compartments\n(Transf/Attention) which can well maintain properties for modelling global\nrelationships, have been proposed as lighter alternatives of full Transformers.\nRecently, there is an increasing trend to co-pollinate complementary\nlocal-global properties from CNN and Transf/Attention architectures, which led\nto a new era of hybrid models. The past years have witnessed substantial growth\nin hybrid CNN-Transf/Attention models across diverse MIA problems. In this\nsystematic review, we survey existing hybrid CNN-Transf/Attention models,\nreview and unravel key architectural designs, analyse breakthroughs, and\nevaluate current and future opportunities as well as challenges. We also\nintroduced a comprehensive analysis framework on generalisation opportunities\nof scientific and clinical impact, based on which new data-driven domain\ngeneralisation and adaptation methods can be stimulated.\n","authors":["Giorgos Papanastasiou","Nikolaos Dikaios","Jiahao Huang","Chengjia Wang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12771v1","updated":"2023-07-24T13:19:15Z","published":"2023-07-24T13:19:15Z","title":"Detecting disturbances in network-coupled dynamical systems with machine\n learning","summary":" Identifying disturbances in network-coupled dynamical systems without\nknowledge of the disturbances or underlying dynamics is a problem with a wide\nrange of applications. For example, one might want to know which nodes in the\nnetwork are being disturbed and identify the type of disturbance. Here we\npresent a model-free method based on machine learning to identify such unknown\ndisturbances based only on prior observations of the system when forced by a\nknown training function. 
We find that this method is able to identify the\nlocations and properties of many different types of unknown disturbances using\na variety of known forcing functions. We illustrate our results both with\nlinear and nonlinear disturbances using food web and neuronal activity models.\nFinally, we discuss how to scale our method to large networks.\n","authors":["Per Sebastian Skardal","Juan G. Restrepo"],"pdf_url":"https://arxiv.org/pdf/2307.12771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05732v6","updated":"2023-07-24T13:15:14Z","published":"2022-09-13T04:58:35Z","title":"Rényi Divergence Deep Mutual Learning","summary":" This paper revisits Deep Mutual Learning (DML), a simple yet effective\ncomputing paradigm. We propose using R\\'{e}nyi divergence instead of the KL\ndivergence, which is more flexible and tunable, to improve vanilla DML. This\nmodification is able to consistently improve performance over vanilla DML with\nlimited additional complexity. The convergence properties of the proposed\nparadigm are analyzed theoretically, and Stochastic Gradient Descent with a\nconstant learning rate is shown to converge with $\\mathcal{O}(1)$-bias in the\nworst case scenario for nonconvex optimization tasks. That is, learning will\nreach nearby local optima but continue searching within a bounded scope, which\nmay help mitigate overfitting. Finally, our extensive empirical results\ndemonstrate the advantage of combining DML and R\\'{e}nyi divergence, leading to\nfurther improvement in model generalization.\n","authors":["Weipeng Huang","Junjie Tao","Changbo Deng","Ming Fan","Wenqiang Wan","Qi Xiong","Guangyuan Piao"],"pdf_url":"https://arxiv.org/pdf/2209.05732v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11531v2","updated":"2023-07-24T13:04:48Z","published":"2022-09-23T11:36:32Z","title":"Deep Learning-based Anonymization of Chest Radiographs: A\n Utility-preserving Measure for Patient Privacy","summary":" Robust and reliable anonymization of chest radiographs constitutes an\nessential step before publishing large datasets of such for research purposes.\nThe conventional anonymization process is carried out by obscuring personal\ninformation in the images with black boxes and removing or replacing\nmeta-information. However, such simple measures retain biometric information in\nthe chest radiographs, allowing patients to be re-identified by a linkage\nattack. Therefore, there is an urgent need to obfuscate the biometric\ninformation appearing in the images. We propose the first deep learning-based\napproach (PriCheXy-Net) to targetedly anonymize chest radiographs while\nmaintaining data utility for diagnostic and machine learning purposes. Our\nmodel architecture is a composition of three independent neural networks that,\nwhen collectively used, allow for learning a deformation field that is able to\nimpede patient re-identification. Quantitative results on the ChestX-ray14\ndataset show a reduction of patient re-identification from 81.8% to 57.7% (AUC)\nafter re-training with little impact on the abnormality classification\nperformance. This indicates the ability to preserve underlying abnormality\npatterns while increasing patient privacy. 
Lastly, we compare our proposed\nanonymization approach with two other obfuscation-based methods (Privacy-Net,\nDP-Pix) and demonstrate the superiority of our method towards resolving the\nprivacy-utility trade-off for chest radiographs.\n","authors":["Kai Packhäuser","Sebastian Gündel","Florian Thamm","Felix Denzinger","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2209.11531v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.07620v2","updated":"2023-07-24T13:03:17Z","published":"2023-07-14T20:39:07Z","title":"Generalizable Embeddings with Cross-batch Metric Learning","summary":" Global average pooling (GAP) is a popular component in deep metric learning\n(DML) for aggregating features. Its effectiveness is often attributed to\ntreating each feature vector as a distinct semantic entity and GAP as a\ncombination of them. Albeit substantiated, such an explanation's algorithmic\nimplications to learn generalizable entities to represent unseen classes, a\ncrucial DML goal, remain unclear. To address this, we formulate GAP as a convex\ncombination of learnable prototypes. We then show that the prototype learning\ncan be expressed as a recursive process fitting a linear predictor to a batch\nof samples. Building on that perspective, we consider two batches of disjoint\nclasses at each iteration and regularize the learning by expressing the samples\nof a batch with the prototypes that are fitted to the other batch. We validate\nour approach on 4 popular DML benchmarks.\n","authors":["Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2307.07620v2.pdf","comment":"\\c{opyright} 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2212.07368v3","updated":"2023-07-24T12:53:23Z","published":"2022-12-14T17:46:17Z","title":"Shuffled Multi-Channel Sparse Signal Recovery","summary":" Mismatches between samples and their respective channel or target commonly\narise in several real-world applications. For instance, whole-brain calcium\nimaging of freely moving organisms, multiple-target tracking or multi-person\ncontactless vital sign monitoring may be severely affected by mismatched\nsample-channel assignments. To systematically address this fundamental problem,\nwe pose it as a signal reconstruction problem where we have lost\ncorrespondences between the samples and their respective channels. Assuming\nthat we have a sensing matrix for the underlying signals, we show that the\nproblem is equivalent to a structured unlabeled sensing problem, and establish\nsufficient conditions for unique recovery. To the best of our knowledge, a\nsampling result for the reconstruction of shuffled multi-channel signals has\nnot been considered in the literature and existing methods for unlabeled\nsensing cannot be directly applied. We extend our results to the case where the\nsignals admit a sparse representation in an overcomplete dictionary (i.e., the\nsensing matrix is not precisely known), and derive sufficient conditions for\nthe reconstruction of shuffled sparse signals. We propose a robust\nreconstruction method that combines sparse signal recovery with robust linear\nregression for the two-channel case. 
The performance and robustness of the\nproposed approach is illustrated in an application related to whole-brain\ncalcium imaging. The proposed methodology can be generalized to sparse signal\nrepresentations other than the ones considered in this work to be applied in a\nvariety of real-world problems with imprecise measurement or channel\nassignment.\n","authors":["Taulant Koka","Manolis C. Tsakiris","Michael Muma","Benjamín Béjar Haro"],"pdf_url":"https://arxiv.org/pdf/2212.07368v3.pdf","comment":"Submitted to TSP"},{"id":"http://arxiv.org/abs/2307.12754v1","updated":"2023-07-24T12:52:55Z","published":"2023-07-24T12:52:55Z","title":"Nonparametric Linear Feature Learning in Regression Through\n Regularisation","summary":" Representation learning plays a crucial role in automated feature selection,\nparticularly in the context of high-dimensional data, where non-parametric\nmethods often struggle. In this study, we focus on supervised learning\nscenarios where the pertinent information resides within a lower-dimensional\nlinear subspace of the data, namely the multi-index model. If this subspace\nwere known, it would greatly enhance prediction, computation, and\ninterpretation. To address this challenge, we propose a novel method for linear\nfeature learning with non-parametric prediction, which simultaneously estimates\nthe prediction function and the linear subspace. Our approach employs empirical\nrisk minimisation, augmented with a penalty on function derivatives, ensuring\nversatility. Leveraging the orthogonality and rotation invariance properties of\nHermite polynomials, we introduce our estimator, named RegFeaL. By utilising\nalternative minimisation, we iteratively rotate the data to improve alignment\nwith leading directions and accurately estimate the relevant dimension in\npractical settings. We establish that our method yields a consistent estimator\nof the prediction function with explicit rates. Additionally, we provide\nempirical results demonstrating the performance of RegFeaL in various\nexperiments.\n","authors":["Bertille Follain","Umut Simsekli","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2307.12754v1.pdf","comment":"43 pages, 16 figures"},{"id":"http://arxiv.org/abs/2307.12745v1","updated":"2023-07-24T12:36:05Z","published":"2023-07-24T12:36:05Z","title":"Concept-based explainability for an EEG transformer model","summary":" Deep learning models are complex due to their size, structure, and inherent\nrandomness in training procedures. Additional complexity arises from the\nselection of datasets and inductive biases. Addressing these challenges for\nexplainability, Kim et al. (2018) introduced Concept Activation Vectors (CAVs),\nwhich aim to understand deep models' internal states in terms of human-aligned\nconcepts. These concepts correspond to directions in latent space, identified\nusing linear discriminants. Although this method was first applied to image\nclassification, it was later adapted to other domains, including natural\nlanguage processing. In this work, we attempt to apply the method to\nelectroencephalogram (EEG) data for explainability in Kostas et al.'s BENDR\n(2021), a large-scale transformer model. A crucial part of this endeavor\ninvolves defining the explanatory concepts and selecting relevant datasets to\nground concepts in the latent space. Our focus is on two mechanisms for EEG\nconcept formation: the use of externally labeled EEG datasets, and the\napplication of anatomically defined concepts. 
The former approach is a\nstraightforward generalization of methods used in image classification, while\nthe latter is novel and specific to EEG. We present evidence that both\napproaches to concept formation yield valuable insights into the\nrepresentations learned by deep EEG models.\n","authors":["Anders Gjølbye Madsen","William Theodor Lehn-Schiøler","Áshildur Jónsdóttir","Bergdís Arnardóttir","Lars Kai Hansen"],"pdf_url":"https://arxiv.org/pdf/2307.12745v1.pdf","comment":"To appear in proceedings of 2023 IEEE International workshop on\n Machine Learning for Signal Processing"},{"id":"http://arxiv.org/abs/2207.09657v3","updated":"2023-07-24T12:35:18Z","published":"2022-07-20T05:22:26Z","title":"Reducing Training Time in Cross-Silo Federated Learning using Multigraph\n Topology","summary":" Federated learning is an active research topic since it enables several\nparticipants to jointly train a model without sharing local data. Currently,\ncross-silo federated learning is a popular training setting that utilizes a few\nhundred reliable data silos with high-speed access links to training a model.\nWhile this approach has been widely applied in real-world scenarios, designing\na robust topology to reduce the training time remains an open problem. In this\npaper, we present a new multigraph topology for cross-silo federated learning.\nWe first construct the multigraph using the overlay graph. We then parse this\nmultigraph into different simple graphs with isolated nodes. The existence of\nisolated nodes allows us to perform model aggregation without waiting for other\nnodes, hence effectively reducing the training time. Intensive experiments on\nthree public datasets show that our proposed method significantly reduces the\ntraining time compared with recent state-of-the-art topologies while\nmaintaining the accuracy of the learned model. Our code can be found at\nhttps://github.com/aioz-ai/MultigraphFL\n","authors":["Tuong Do","Binh X. Nguyen","Vuong Pham","Toan Tran","Erman Tjiputra","Quang Tran","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2207.09657v3.pdf","comment":"accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2302.09629v2","updated":"2023-07-24T12:33:09Z","published":"2023-02-19T17:15:56Z","title":"BiofilmScanner: A Computational Intelligence Approach to Obtain\n Bacterial Cell Morphological Attributes from Biofilm Image","summary":" Desulfovibrio alaskensis G20 (DA-G20) is utilized as a model for\nsulfate-reducing bacteria (SRB) that are associated with corrosion issues\ncaused by microorganisms. SRB-based biofilms are thought to be responsible for\nthe billion-dollar-per-year bio-corrosion of metal infrastructure.\nUnderstanding the extraction of the bacterial cells' shape and size properties\nin the SRB-biofilm at different growth stages will assist with the design of\nanti-corrosion techniques. However, numerous issues affect current approaches,\nincluding time-consuming geometric property extraction, low efficiency, and\nhigh error rates. This paper proposes BiofilScanner, a Yolact-based deep\nlearning method integrated with invariant moments to address these problems.\nOur approach efficiently detects and segments bacterial cells in an SRB image\nwhile simultaneously invariant moments measure the geometric characteristics of\nthe segmented cells with low errors. 
The numerical experiments of the proposed\nmethod demonstrate that the BiofilmScanner is 2.1x and 6.8x faster than our\nearlier Mask-RCNN and DLv3+ methods for detecting, segmenting, and measuring\nthe geometric properties of the cell. Furthermore, the BiofilmScanner achieved\nan F1-score of 85.28% while Mask-RCNN and DLv3+ obtained F1-scores of 77.67%\nand 75.18%, respectively.\n","authors":["Md Hafizur Rahman","Md Ali Azam","Md Abir Hossen","Shankarachary Ragi","Venkataramana Gadhamshetty"],"pdf_url":"https://arxiv.org/pdf/2302.09629v2.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2306.16177v3","updated":"2023-07-24T12:32:58Z","published":"2023-06-28T12:58:42Z","title":"Defining data science: a new field of inquiry","summary":" Data science is not a science. It is a research paradigm. Its power, scope,\nand scale will surpass science, our most powerful research paradigm, to enable\nknowledge discovery and change our world. We have yet to understand and define\nit, vital to realizing its potential and managing its risks. Modern data\nscience is in its infancy. Emerging slowly since 1962 and rapidly since 2000,\nit is a fundamentally new field of inquiry, one of the most active, powerful,\nand rapidly evolving 21st century innovations. Due to its value, power, and\napplicability, it is emerging in over 40 disciplines, hundreds of research\nareas, and thousands of applications. Millions of data science publications\ncontain myriad definitions of data science and data science problem solving.\nDue to its infancy, many definitions are independent, application specific,\nmutually incomplete, redundant, or inconsistent, hence so is data science. This\nresearch addresses this data science multiple definitions challenge by\nproposing the development of coherent, unified definition based on a data\nscience reference framework using a data science journal for the data science\ncommunity to achieve such a definition. This paper provides candidate\ndefinitions for essential data science artifacts that are required to discuss\nsuch a definition. They are based on the classical research paradigm concept\nconsisting of a philosophy of data science, the data science problem solving\nparadigm, and the six component data science reference framework (axiology,\nontology, epistemology, methodology, methods, technology) that is a frequently\ncalled for unifying framework with which to define, unify, and evolve data\nscience. It presents challenges for defining data science, solution approaches,\ni.e., means for defining data science, and their requirements and benefits as\nthe basis of a comprehensive solution.\n","authors":["Michael L Brodie"],"pdf_url":"https://arxiv.org/pdf/2306.16177v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12865v3","updated":"2023-07-24T12:08:50Z","published":"2023-03-22T18:59:48Z","title":"NeRF-GAN Distillation for Efficient 3D-Aware Generation with\n Convolutions","summary":" Pose-conditioned convolutional generative models struggle with high-quality\n3D-consistent image generation from single-view datasets, due to their lack of\nsufficient 3D priors. Recently, the integration of Neural Radiance Fields\n(NeRFs) and generative models, such as Generative Adversarial Networks (GANs),\nhas transformed 3D-aware generation from single-view images. NeRF-GANs exploit\nthe strong inductive bias of neural 3D representations and volumetric rendering\nat the cost of higher computational complexity. 
This study aims at revisiting\npose-conditioned 2D GANs for efficient 3D-aware generation at inference time by\ndistilling 3D knowledge from pretrained NeRF-GANs. We propose a simple and\neffective method, based on re-using the well-disentangled latent space of a\npre-trained NeRF-GAN in a pose-conditioned convolutional network to directly\ngenerate 3D-consistent images corresponding to the underlying 3D\nrepresentations. Experiments on several datasets demonstrate that the proposed\nmethod obtains results comparable with volumetric rendering in terms of quality\nand 3D consistency while benefiting from the computational advantage of\nconvolutional networks. The code will be available at:\nhttps://github.com/mshahbazi72/NeRF-GAN-Distillation\n","authors":["Mohamad Shahbazi","Evangelos Ntavelis","Alessio Tonioni","Edo Collins","Danda Pani Paudel","Martin Danelljan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.12865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12716v1","updated":"2023-07-24T11:55:32Z","published":"2023-07-24T11:55:32Z","title":"Safety Performance of Neural Networks in the Presence of Covariate Shift","summary":" Covariate shift may impact the operational safety performance of neural\nnetworks. A re-evaluation of the safety performance, however, requires\ncollecting new operational data and creating corresponding ground truth labels,\nwhich often is not possible during operation. We are therefore proposing to\nreshape the initial test set, as used for the safety performance evaluation\nprior to deployment, based on an approximation of the operational data. This\napproximation is obtained by observing and learning the distribution of\nactivation patterns of neurons in the network during operation. The reshaped\ntest set reflects the distribution of neuron activation values as observed\nduring operation, and may therefore be used for re-evaluating safety\nperformance in the presence of covariate shift. First, we derive conservative\nbounds on the values of neurons by applying finite binning and static dataflow\nanalysis. Second, we formulate a mixed integer linear programming (MILP)\nconstraint for constructing the minimum set of data points to be removed in the\ntest set, such that the difference between the discretized test and operational\ndistributions is bounded. We discuss potential benefits and limitations of this\nconstraint-based approach based on our initial experience with an implemented\nresearch prototype.\n","authors":["Chih-Hong Cheng","Harald Ruess","Konstantinos Theodorou"],"pdf_url":"https://arxiv.org/pdf/2307.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13871v2","updated":"2023-07-24T11:44:01Z","published":"2023-04-26T23:34:40Z","title":"Typical and atypical solutions in non-convex neural networks with\n discrete and continuous weights","summary":" We study the binary and continuous negative-margin perceptrons as simple\nnon-convex neural network models learning random rules and associations. We\nanalyze the geometry of the landscape of solutions in both models and find\nimportant similarities and differences. Both models exhibit subdominant\nminimizers which are extremely flat and wide. These minimizers coexist with a\nbackground of dominant solutions which are composed by an exponential number of\nalgorithmically inaccessible small clusters for the binary case (the frozen\n1-RSB phase) or a hierarchical structure of clusters of different sizes for the\nspherical case (the full RSB phase). 
In both cases, when a certain threshold in\nconstraint density is crossed, the local entropy of the wide flat minima\nbecomes non-monotonic, indicating a break-up of the space of robust solutions\ninto disconnected components. This has a strong impact on the behavior of\nalgorithms in binary models, which cannot access the remaining isolated\nclusters. For the spherical case the behaviour is different, since even beyond\nthe disappearance of the wide flat minima the remaining solutions are shown to\nalways be surrounded by a large number of other solutions at any distance, up\nto capacity. Indeed, we exhibit numerical evidence that algorithms seem to find\nsolutions up to the SAT/UNSAT transition, that we compute here using an 1RSB\napproximation. For both models, the generalization performance as a learning\ndevice is shown to be greatly improved by the existence of wide flat minimizers\neven when trained in the highly underconstrained regime of very negative\nmargins.\n","authors":["Carlo Baldassi","Enrico M. Malatesta","Gabriele Perugini","Riccardo Zecchina"],"pdf_url":"https://arxiv.org/pdf/2304.13871v2.pdf","comment":"34 pages, 13 figures"},{"id":"http://arxiv.org/abs/2210.17230v3","updated":"2023-07-24T11:43:26Z","published":"2022-10-31T11:15:48Z","title":"Lipschitz-regularized gradient flows and generative particle algorithms\n for high-dimensional scarce data","summary":" We build a new class of generative algorithms capable of efficiently learning\nan arbitrary target distribution from possibly scarce, high-dimensional data\nand subsequently generate new samples. These generative algorithms are\nparticle-based and are constructed as gradient flows of Lipschitz-regularized\nKullback-Leibler or other $f$-divergences, where data from a source\ndistribution can be stably transported as particles, towards the vicinity of\nthe target distribution. As a highlighted result in data integration, we\ndemonstrate that the proposed algorithms correctly transport gene expression\ndata points with dimension exceeding 54K, while the sample size is typically\nonly in the hundreds.\n","authors":["Hyemin Gu","Panagiota Birmpa","Yannis Pantazis","Luc Rey-Bellet","Markos A. Katsoulakis"],"pdf_url":"https://arxiv.org/pdf/2210.17230v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12703v1","updated":"2023-07-24T11:37:02Z","published":"2023-07-24T11:37:02Z","title":"Policy Gradient Optimal Correlation Search for Variance Reduction in\n Monte Carlo simulation and Maximum Optimal Transport","summary":" We propose a new algorithm for variance reduction when estimating $f(X_T)$\nwhere $X$ is the solution to some stochastic differential equation and $f$ is a\ntest function. The new estimator is $(f(X^1_T) + f(X^2_T))/2$, where $X^1$ and\n$X^2$ have same marginal law as $X$ but are pathwise correlated so that to\nreduce the variance. The optimal correlation function $\\rho$ is approximated by\na deep neural network and is calibrated along the trajectories of $(X^1, X^2)$\nby policy gradient and reinforcement learning techniques. 
Finding an optimal\ncoupling given marginal laws has links with maximum optimal transport.\n","authors":["Pierre Bras","Gilles Pagès"],"pdf_url":"https://arxiv.org/pdf/2307.12703v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2303.09340v3","updated":"2023-07-24T11:34:21Z","published":"2023-03-16T14:21:45Z","title":"Improving Automated Hemorrhage Detection in Sparse-view Computed\n Tomography via Deep Convolutional Neural Network based Artifact Reduction","summary":" Purpose: Sparse-view computed tomography (CT) is an effective way to reduce\ndose by lowering the total number of views acquired, albeit at the expense of\nimage quality, which, in turn, can impact the ability to detect diseases. We\nexplore deep learning-based artifact reduction in sparse-view cranial CT scans\nand its impact on automated hemorrhage detection. Methods: We trained a U-Net\nfor artefact reduction on simulated sparse-view cranial CT scans from 3000\npatients obtained from a public dataset and reconstructed with varying levels\nof sub-sampling. Additionally, we trained a convolutional neural network on\nfully sampled CT data from 17,545 patients for automated hemorrhage detection.\nWe evaluated the classification performance using the area under the receiver\noperator characteristic curves (AUC-ROCs) with corresponding 95% confidence\nintervals (CIs) and the DeLong test, along with confusion matrices. The\nperformance of the U-Net was compared to an analytical approach based on total\nvariation (TV). Results: The U-Net performed superior compared to unprocessed\nand TV-processed images with respect to image quality and automated hemorrhage\ndiagnosis. With U-Net post-processing, the number of views can be reduced from\n4096 (AUC-ROC: 0.974; 95% CI: 0.972-0.976) views to 512 views (0.973;\n0.971-0.975) with minimal decrease in hemorrhage detection (P<.001) and to 256\nviews (0.967; 0.964-0.969) with a slight performance decrease (P<.001).\nConclusion: The results suggest that U-Net based artifact reduction\nsubstantially enhances automated hemorrhage detection in sparse-view cranial\nCTs. Our findings highlight that appropriate post-processing is crucial for\noptimal image quality and diagnostic accuracy while minimizing radiation dose.\n","authors":["Johannes Thalhammer","Manuel Schultheiss","Tina Dorosti","Tobias Lasser","Franz Pfeiffer","Daniela Pfeiffer","Florian Schaff"],"pdf_url":"https://arxiv.org/pdf/2303.09340v3.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.12698v1","updated":"2023-07-24T11:27:14Z","published":"2023-07-24T11:27:14Z","title":"MC-JEPA: A Joint-Embedding Predictive Architecture for Self-Supervised\n Learning of Motion and Content Features","summary":" Self-supervised learning of visual representations has been focusing on\nlearning content features, which do not capture object motion or location, and\nfocus on identifying and differentiating objects in images and videos. On the\nother hand, optical flow estimation is a task that does not involve\nunderstanding the content of the images on which it is estimated. We unify the\ntwo approaches and introduce MC-JEPA, a joint-embedding predictive architecture\nand self-supervised learning approach to jointly learn optical flow and content\nfeatures within a shared encoder, demonstrating that the two associated\nobjectives; the optical flow estimation objective and the self-supervised\nlearning objective; benefit from each other and thus learn content features\nthat incorporate motion information. 
The proposed approach achieves performance\non-par with existing unsupervised optical flow benchmarks, as well as with\ncommon self-supervised learning approaches on downstream tasks such as semantic\nsegmentation of images and videos.\n","authors":["Adrien Bardes","Jean Ponce","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2307.12698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10763v3","updated":"2023-07-24T11:15:47Z","published":"2023-02-12T12:19:57Z","title":"Contrastive Learning and the Emergence of Attributes Associations","summary":" In response to an object presentation, supervised learning schemes generally\nrespond with a parsimonious label. Upon a similar presentation we humans\nrespond again with a label, but are flooded, in addition, by a myriad of\nassociations. A significant portion of these consist of the presented object\nattributes. Contrastive learning is a semi-supervised learning scheme based on\nthe application of identity preserving transformations on the object input\nrepresentations. It is conjectured in this work that these same applied\ntransformations preserve, in addition to the identity of the presented object,\nalso the identity of its semantically meaningful attributes. The corollary of\nthis is that the output representations of such a contrastive learning scheme\ncontain valuable information not only for the classification of the presented\nobject, but also for the presence or absence decision of any attribute of\ninterest. Simulation results which demonstrate this idea and the feasibility of\nthis conjecture are presented.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2302.10763v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2210.12583v2","updated":"2023-07-24T11:13:21Z","published":"2022-10-23T00:45:05Z","title":"Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model\n Predictive Control","summary":" Model-based control requires an accurate model of the system dynamics for\nprecisely and safely controlling the robot in complex and dynamic environments.\nMoreover, in the presence of variations in the operating conditions, the model\nshould be continuously refined to compensate for dynamics changes. In this\npaper, we present a self-supervised learning approach that actively models the\ndynamics of nonlinear robotic systems. We combine offline learning from past\nexperience and online learning from current robot interaction with the unknown\nenvironment. These two ingredients enable a highly sample-efficient and\nadaptive learning process, capable of accurately inferring model dynamics in\nreal-time even in operating regimes that greatly differ from the training\ndistribution. Moreover, we design an uncertainty-aware model predictive\ncontroller that is heuristically conditioned to the aleatoric (data)\nuncertainty of the learned dynamics. This controller actively chooses the\noptimal control actions that (i) optimize the control performance and (ii)\nimprove the efficiency of online learning sample collection. We demonstrate the\neffectiveness of our method through a series of challenging real-world\nexperiments using a quadrotor system. 
Our approach showcases high resilience\nand generalization capabilities by consistently adapting to unseen flight\nconditions, while it significantly outperforms classical and adaptive control\nbaselines.\n","authors":["Alessandro Saviolo","Jonathan Frey","Abhishek Rathod","Moritz Diehl","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2210.12583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12689v1","updated":"2023-07-24T11:04:22Z","published":"2023-07-24T11:04:22Z","title":"Addressing the Impact of Localized Training Data in Graph Neural\n Networks","summary":" Graph Neural Networks (GNNs) have achieved notable success in learning from\ngraph-structured data, owing to their ability to capture intricate dependencies\nand relationships between nodes. They excel in various applications, including\nsemi-supervised node classification, link prediction, and graph generation.\nHowever, it is important to acknowledge that the majority of state-of-the-art\nGNN models are built upon the assumption of an in-distribution setting, which\nhinders their performance on real-world graphs with dynamic structures. In this\narticle, we aim to assess the impact of training GNNs on localized subsets of\nthe graph. Such restricted training data may lead to a model that performs well\nin the specific region it was trained on but fails to generalize and make\naccurate predictions for the entire graph. In the context of graph-based\nsemi-supervised learning (SSL), resource constraints often lead to scenarios\nwhere the dataset is large, but only a portion of it can be labeled, affecting\nthe model's performance. This limitation affects tasks like anomaly detection\nor spam detection when labeling processes are biased or influenced by human\nsubjectivity. To tackle the challenges posed by localized training data, we\napproach the problem as an out-of-distribution (OOD) data issue by by aligning\nthe distributions between the training data, which represents a small portion\nof labeled data, and the graph inference process that involves making\npredictions for the entire graph. We propose a regularization method to\nminimize distributional discrepancies between localized training data and graph\ninference, improving model performance on OOD data. Extensive tests on popular\nGNN models show significant performance improvement on three citation GNN\nbenchmark datasets. The regularization approach effectively enhances model\nadaptation and generalization, overcoming challenges posed by OOD data.\n","authors":["Singh Akansha"],"pdf_url":"https://arxiv.org/pdf/2307.12689v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.12679v1","updated":"2023-07-24T10:33:32Z","published":"2023-07-24T10:33:32Z","title":"An Estimator for the Sensitivity to Perturbations of Deep Neural\n Networks","summary":" For Deep Neural Networks (DNNs) to become useful in safety-critical\napplications, such as self-driving cars and disease diagnosis, they must be\nstable to perturbations in input and model parameters. Characterizing the\nsensitivity of a DNN to perturbations is necessary to determine minimal\nbit-width precision that may be used to safely represent the network. However,\nno general result exists that is capable of predicting the sensitivity of a\ngiven DNN to round-off error, noise, or other perturbations in input. This\npaper derives an estimator that can predict such quantities. 
The estimator is\nderived via inequalities and matrix norms, and the resulting quantity is\nroughly analogous to a condition number for the entire neural network. An\napproximation of the estimator is tested on two Convolutional Neural Networks,\nAlexNet and VGG-19, using the ImageNet dataset. For each of these networks, the\ntightness of the estimator is explored via random perturbations and adversarial\nattacks.\n","authors":["Naman Maheshwari","Nicholas Malaya","Scott Moe","Jaydeep P. Kulkarni","Sudhanva Gurumurthi"],"pdf_url":"https://arxiv.org/pdf/2307.12679v1.pdf","comment":"Actual work and paper concluded in January 2019"},{"id":"http://arxiv.org/abs/2307.12672v1","updated":"2023-07-24T10:20:14Z","published":"2023-07-24T10:20:14Z","title":"Global k-Space Interpolation for Dynamic MRI Reconstruction using Masked\n Image Modeling","summary":" In dynamic Magnetic Resonance Imaging (MRI), k-space is typically\nundersampled due to limited scan time, resulting in aliasing artifacts in the\nimage domain. Hence, dynamic MR reconstruction requires not only modeling\nspatial frequency components in the x and y directions of k-space but also\nconsidering temporal redundancy. Most previous works rely on image-domain\nregularizers (priors) to conduct MR reconstruction. In contrast, we focus on\ninterpolating the undersampled k-space before obtaining images with Fourier\ntransform. In this work, we connect masked image modeling with k-space\ninterpolation and propose a novel Transformer-based k-space Global\nInterpolation Network, termed k-GIN. Our k-GIN learns global dependencies among\nlow- and high-frequency components of 2D+t k-space and uses it to interpolate\nunsampled data. Further, we propose a novel k-space Iterative Refinement Module\n(k-IRM) to enhance the high-frequency components learning. We evaluate our\napproach on 92 in-house 2D+t cardiac MR subjects and compare it to MR\nreconstruction methods with image-domain regularizers. Experiments show that\nour proposed k-space interpolation method quantitatively and qualitatively\noutperforms baseline methods. Importantly, the proposed approach achieves\nsubstantially higher robustness and generalizability in cases of\nhighly-undersampled MR data.\n","authors":["Jiazhen Pan","Suprosanna Shit","Özgün Turgut","Wenqi Huang","Hongwei Bran Li","Nil Stolt-Ansó","Thomas Küstner","Kerstin Hammernik","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2307.12672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12667v1","updated":"2023-07-24T10:14:51Z","published":"2023-07-24T10:14:51Z","title":"TransFusion: Generating Long, High Fidelity Time Series using Diffusion\n Models with Transformers","summary":" The generation of high-quality, long-sequenced time-series data is essential\ndue to its wide range of applications. In the past, standalone Recurrent and\nConvolutional Neural Network-based Generative Adversarial Networks (GAN) were\nused to synthesize time-series data. However, they are inadequate for\ngenerating long sequences of time-series data due to limitations in the\narchitecture. Furthermore, GANs are well known for their training instability\nand mode collapse problem. To address this, we propose TransFusion, a\ndiffusion, and transformers-based generative model to generate high-quality\nlong-sequence time-series data. We have stretched the sequence length to 384,\nand generated high-quality synthetic data. To the best of our knowledge, this\nis the first study that has been done with this long-sequence length. 
Also, we\nintroduce two evaluation metrics to evaluate the quality of the synthetic data\nas well as its predictive characteristics. We evaluate TransFusion with a wide\nvariety of visual and empirical metrics, and TransFusion outperforms the\nprevious state-of-the-art by a significant margin.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2307.12667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12660v1","updated":"2023-07-24T10:04:27Z","published":"2023-07-24T10:04:27Z","title":"Online Continual Learning in Keyword Spotting for Low-Resource Devices\n via Pooling High-Order Temporal Statistics","summary":" Keyword Spotting (KWS) models on embedded devices should adapt fast to new\nuser-defined words without forgetting previous ones. Embedded devices have\nlimited storage and computational resources, thus, they cannot save samples or\nupdate large models. We consider the setup of embedded online continual\nlearning (EOCL), where KWS models with frozen backbone are trained to\nincrementally recognize new words from a non-repeated stream of samples, seen\none at a time. To this end, we propose Temporal Aware Pooling (TAP) which\nconstructs an enriched feature space computing high-order moments of speech\nfeatures extracted by a pre-trained backbone. Our method, TAP-SLDA, updates a\nGaussian model for each class on the enriched feature space to effectively use\naudio representations. In experimental analyses, TAP-SLDA outperforms\ncompetitors on several setups, backbones, and baselines, bringing a relative\naverage gain of 11.3% on the GSC dataset.\n","authors":["Umberto Michieli","Pablo Peso Parada","Mete Ozay"],"pdf_url":"https://arxiv.org/pdf/2307.12660v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2306.12231v2","updated":"2023-07-24T09:36:05Z","published":"2023-06-21T12:44:52Z","title":"Predicting protein variants with equivariant graph neural networks","summary":" Pre-trained models have been successful in many protein engineering tasks.\nMost notably, sequence-based models have achieved state-of-the-art performance\non protein fitness prediction while structure-based models have been used\nexperimentally to develop proteins with enhanced functions. However, there is a\nresearch gap in comparing structure- and sequence-based methods for predicting\nprotein variants that are better than the wildtype protein. This paper aims to\naddress this gap by conducting a comparative study between the abilities of\nequivariant graph neural networks (EGNNs) and sequence-based approaches to\nidentify promising amino-acid mutations. The results show that our proposed\nstructural approach achieves a competitive performance to sequence-based\nmethods while being trained on significantly fewer molecules. 
Additionally, we\nfind that combining assay labelled data with structure pre-trained models\nyields similar trends as with sequence pre-trained models.\n Our code and trained models can be found at:\nhttps://github.com/semiluna/partIII-amino-acid-prediction.\n","authors":["Antonia Boca","Simon Mathis"],"pdf_url":"https://arxiv.org/pdf/2306.12231v2.pdf","comment":"4 pages, 2 figures, accepted to the 2023 ICML Workshop on\n Computational Biology"},{"id":"http://arxiv.org/abs/2307.12644v1","updated":"2023-07-24T09:35:47Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" Remote Photoplethysmography (rPPG) is a technology that utilizes the light\nabsorption properties of hemoglobin, captured via camera, to analyze and\nmeasure blood volume pulse (BVP). By analyzing the measured BVP, various\nphysiological signals such as heart rate, stress levels, and blood pressure can\nbe derived, enabling applications such as the early prediction of\ncardiovascular diseases. rPPG is a rapidly evolving field as it allows the\nmeasurement of vital signals using camera-equipped devices without the need for\nadditional devices such as blood pressure monitors or pulse oximeters, and\nwithout the assistance of medical experts. Despite extensive efforts and\nadvances in this field, serious challenges remain, including issues related to\nskin color, camera characteristics, ambient lighting, and other sources of\nnoise, which degrade performance accuracy. We argue that fair and evaluable\nbenchmarking is urgently required to overcome these challenges and make any\nmeaningful progress from both academic and commercial perspectives. In most\nexisting work, models are trained, tested, and validated only on limited\ndatasets. Worse still, some studies lack available code or reproducibility,\nmaking it difficult to fairly evaluate and compare performance. Therefore, the\npurpose of this study is to provide a benchmarking framework to evaluate\nvarious rPPG techniques across a wide range of datasets for fair evaluation and\ncomparison, including both conventional non-deep neural network (non-DNN) and\ndeep neural network (DNN) methods. GitHub URL:\nhttps://github.com/remotebiosensing/rppg.\n","authors":["Dae Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2307.12639v1","updated":"2023-07-24T09:30:30Z","published":"2023-07-24T09:30:30Z","title":"Fake News Detection Through Graph-based Neural Networks: A Survey","summary":" The popularity of online social networks has enabled rapid dissemination of\ninformation. People now can share and consume information much more rapidly\nthan ever before. However, low-quality and/or accidentally/deliberately fake\ninformation can also spread rapidly. This can lead to considerable and negative\nimpacts on society. Identifying, labelling and debunking online misinformation\nas early as possible has become an increasingly urgent problem. Many methods\nhave been proposed to detect fake news including many deep learning and\ngraph-based approaches. In recent years, graph-based methods have yielded\nstrong results, as they can closely model the social context and propagation\nprocess of online news. 
In this paper, we present a systematic review of fake\nnews detection studies based on graph-based and deep learning-based techniques.\nWe classify existing graph-based methods into knowledge-driven methods,\npropagation-based methods, and heterogeneous social context-based methods,\ndepending on how a graph structure is constructed to model news related\ninformation flows. We further discuss the challenges and open problems in\ngraph-based fake news detection and identify future research directions.\n","authors":["Shuzhi Gong","Richard O. Sinnott","Jianzhong Qi","Cecile Paris"],"pdf_url":"https://arxiv.org/pdf/2307.12639v1.pdf","comment":"18 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2304.03981v2","updated":"2023-07-24T09:24:04Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12636v1","updated":"2023-07-24T09:19:38Z","published":"2023-07-24T09:19:38Z","title":"Identifying drivers and mitigators for congestion and redispatch in the\n German electric power system with explainable AI","summary":" The transition to a sustainable energy supply challenges the operation of\nelectric power systems in manifold ways. Transmission grid loads increase as\nwind and solar power are often installed far away from the consumers. In\nextreme cases, system operators must intervene via countertrading or redispatch\nto ensure grid stability. In this article, we provide a data-driven analysis of\ncongestion in the German transmission grid. We develop an explainable machine\nlearning model to predict the volume of redispatch and countertrade on an\nhourly basis. The model reveals factors that drive or mitigate grid congestion\nand quantifies their impact. We show that, as expected, wind power generation\nis the main driver, but hydropower and cross-border electricity trading also\nplay an essential role. Solar power, on the other hand, has no mitigating\neffect. 
Our results suggest that a change to the market design would alleviate\ncongestion.\n","authors":["Maurizio Titz","Sebastian Pütz","Dirk Witthaut"],"pdf_url":"https://arxiv.org/pdf/2307.12636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14430v3","updated":"2023-07-24T09:15:02Z","published":"2022-09-28T21:31:43Z","title":"Minimax Optimal Kernel Operator Learning via Multilevel Training","summary":" Learning mappings between infinite-dimensional function spaces has achieved\nempirical success in many disciplines of machine learning, including generative\nmodeling, functional data analysis, causal inference, and multi-agent\nreinforcement learning. In this paper, we study the statistical limit of\nlearning a Hilbert-Schmidt operator between two infinite-dimensional Sobolev\nreproducing kernel Hilbert spaces. We establish the information-theoretic lower\nbound in terms of the Sobolev Hilbert-Schmidt norm and show that a\nregularization that learns the spectral components below the bias contour and\nignores the ones that are above the variance contour can achieve the optimal\nlearning rate. At the same time, the spectral components between the bias and\nvariance contours give us flexibility in designing computationally feasible\nmachine learning algorithms. Based on this observation, we develop a multilevel\nkernel operator learning algorithm that is optimal when learning linear\noperators between infinite-dimensional function spaces.\n","authors":["Jikai Jin","Yiping Lu","Jose Blanchet","Lexing Ying"],"pdf_url":"https://arxiv.org/pdf/2209.14430v3.pdf","comment":"ICLR 2023 spotlight"},{"id":"http://arxiv.org/abs/2307.12625v1","updated":"2023-07-24T08:56:25Z","published":"2023-07-24T08:56:25Z","title":"De-confounding Representation Learning for Counterfactual Inference on\n Continuous Treatment via Generative Adversarial Network","summary":" Counterfactual inference for continuous rather than binary treatment\nvariables is more common in real-world causal inference tasks. While there are\nalready some sample reweighting methods based on Marginal Structural Model for\neliminating the confounding bias, they generally focus on removing the\ntreatment's linear dependence on confounders and rely on the accuracy of the\nassumed parametric models, which are usually unverifiable. In this paper, we\npropose a de-confounding representation learning (DRL) framework for\ncounterfactual outcome estimation of continuous treatment by generating the\nrepresentations of covariates disentangled with the treatment variables. The\nDRL is a non-parametric model that eliminates both linear and nonlinear\ndependence between treatment and covariates. Specifically, we train the\ncorrelations between the de-confounded representations and the treatment\nvariables against the correlations between the covariate representations and\nthe treatment variables to eliminate confounding bias. Further, a\ncounterfactual inference network is embedded into the framework to make the\nlearned representations serve both de-confounding and trusted inference.\nExtensive experiments on synthetic datasets show that the DRL model performs\nsuperiorly in learning de-confounding representations and outperforms\nstate-of-the-art counterfactual inference models for continuous treatment\nvariables. 
In addition, we apply the DRL model to a real-world medical dataset\nMIMIC and demonstrate a detailed causal relationship between red cell width\ndistribution and mortality.\n","authors":["Yonghe Zhao","Qiang Huang","Haolong Zeng","Yun Pen","Huiyan Sun"],"pdf_url":"https://arxiv.org/pdf/2307.12625v1.pdf","comment":"15 pages,4 figures"},{"id":"http://arxiv.org/abs/2307.12617v1","updated":"2023-07-24T08:46:12Z","published":"2023-07-24T08:46:12Z","title":"Predicting Ordinary Differential Equations with Transformers","summary":" We develop a transformer-based sequence-to-sequence model that recovers\nscalar ordinary differential equations (ODEs) in symbolic form from irregularly\nsampled and noisy observations of a single solution trajectory. We demonstrate\nin extensive empirical evaluations that our model performs better or on par\nwith existing methods in terms of accurate recovery across various settings.\nMoreover, our method is efficiently scalable: after one-time pretraining on a\nlarge set of ODEs, we can infer the governing law of a new observed solution in\na few forward passes of the model.\n","authors":["Sören Becker","Michal Klein","Alexander Neitz","Giambattista Parascandolo","Niki Kilbertus"],"pdf_url":"https://arxiv.org/pdf/2307.12617v1.pdf","comment":"Published at ICML 2023"},{"id":"http://arxiv.org/abs/2307.09458v3","updated":"2023-07-24T08:32:40Z","published":"2023-07-18T17:39:04Z","title":"Does Circuit Analysis Interpretability Scale? Evidence from Multiple\n Choice Capabilities in Chinchilla","summary":" \\emph{Circuit analysis} is a promising technique for understanding the\ninternal mechanisms of language models. However, existing analyses are done in\nsmall models far from the state of the art. To address this, we present a case\nstudy of circuit analysis in the 70B Chinchilla model, aiming to test the\nscalability of circuit analysis. In particular, we study multiple-choice\nquestion answering, and investigate Chinchilla's capability to identify the\ncorrect answer \\emph{label} given knowledge of the correct answer \\emph{text}.\nWe find that the existing techniques of logit attribution, attention pattern\nvisualization, and activation patching naturally scale to Chinchilla, allowing\nus to identify and categorize a small set of `output nodes' (attention heads\nand MLPs).\n We further study the `correct letter' category of attention heads aiming to\nunderstand the semantics of their features, with mixed results. For normal\nmultiple-choice question answers, we significantly compress the query, key and\nvalue subspaces of the head without loss of performance when operating on the\nanswer labels for multiple-choice questions, and we show that the query and key\nsubspaces represent an `Nth item in an enumeration' feature to at least some\nextent. 
However, when we attempt to use this explanation to understand the\nheads' behaviour on a more general distribution including randomized answer\nlabels, we find that it is only a partial explanation, suggesting there is more\nto learn about the operation of `correct letter' heads on multiple choice\nquestion answering.\n","authors":["Tom Lieberum","Matthew Rahtz","János Kramár","Neel Nanda","Geoffrey Irving","Rohin Shah","Vladimir Mikulik"],"pdf_url":"https://arxiv.org/pdf/2307.09458v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12607v1","updated":"2023-07-24T08:32:27Z","published":"2023-07-24T08:32:27Z","title":"ExWarp: Extrapolation and Warping-based Temporal Supersampling for\n High-frequency Displays","summary":" High-frequency displays are gaining immense popularity because of their\nincreasing use in video games and virtual reality applications. However, the\nissue is that the underlying GPUs cannot continuously generate frames at this\nhigh rate -- this results in a less smooth and responsive experience.\nFurthermore, if the frame rate is not synchronized with the refresh rate, the\nuser may experience screen tearing and stuttering. Previous works propose\nincreasing the frame rate to provide a smooth experience on modern displays by\npredicting new frames based on past or future frames. Interpolation and\nextrapolation are two widely used algorithms that predict new frames.\nInterpolation requires waiting for the future frame to make a prediction, which\nadds additional latency. On the other hand, extrapolation provides a better\nquality of experience because it relies solely on past frames -- it does not\nincur any additional latency. The simplest method to extrapolate a frame is to\nwarp the previous frame using motion vectors; however, the warped frame may\ncontain improperly rendered visual artifacts due to dynamic objects -- this\nmakes it very challenging to design such a scheme. Past work has used DNNs to\nget good accuracy, however, these approaches are slow. This paper proposes\nExwarp -- an approach based on reinforcement learning (RL) to intelligently\nchoose between the slower DNN-based extrapolation and faster warping-based\nmethods to increase the frame rate by 4x with an almost negligible reduction in\nthe perceived image quality.\n","authors":["Akanksha Dixit","Yashashwee Chakrabarty","Smruti R. Sarangi"],"pdf_url":"https://arxiv.org/pdf/2307.12607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12601v1","updated":"2023-07-24T08:21:13Z","published":"2023-07-24T08:21:13Z","title":"Concept backpropagation: An Explainable AI approach for visualising\n learned concepts in neural network models","summary":" Neural network models are widely used in a variety of domains, often as\nblack-box solutions, since they are not directly interpretable for humans. The\nfield of explainable artificial intelligence aims at developing explanation\nmethods to address this challenge, and several approaches have been developed\nover the recent years, including methods for investigating what type of\nknowledge these models internalise during the training process. Among these,\nthe method of concept detection, investigates which \\emph{concepts} neural\nnetwork models learn to represent in order to complete their tasks. In this\nwork, we present an extension to the method of concept detection, named\n\\emph{concept backpropagation}, which provides a way of analysing how the\ninformation representing a given concept is internalised in a given neural\nnetwork model. 
In this approach, the model input is perturbed in a manner\nguided by a trained concept probe for the described model, such that the\nconcept of interest is maximised. This allows for the visualisation of the\ndetected concept directly in the input space of the model, which in turn makes\nit possible to see what information the model depends on for representing the\ndescribed concept. We present results for this method applied to a various set\nof input modalities, and discuss how our proposed method can be used to\nvisualise what information trained concept probes use, and the degree as to\nwhich the representation of the probed concept is entangled within the neural\nnetwork model itself.\n","authors":["Patrik Hammersborg","Inga Strümke"],"pdf_url":"https://arxiv.org/pdf/2307.12601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12594v1","updated":"2023-07-24T08:11:59Z","published":"2023-07-24T08:11:59Z","title":"Optimized data collection and analysis process for studying\n solar-thermal desalination by machine learning","summary":" An effective interdisciplinary study between machine learning and\nsolar-thermal desalination requires a sufficiently large and well-analyzed\nexperimental datasets. This study develops a modified dataset collection and\nanalysis process for studying solar-thermal desalination by machine learning.\nBased on the optimized water condensation and collection process, the proposed\nexperimental method collects over one thousand datasets, which is ten times\nmore than the average number of datasets in previous works, by accelerating\ndata collection and reducing the time by 83.3%. On the other hand, the effects\nof dataset features are investigated by using three different algorithms,\nincluding artificial neural networks, multiple linear regressions, and random\nforests. The investigation focuses on the effects of dataset size and range on\nprediction accuracy, factor importance ranking, and the model's generalization\nability. The results demonstrate that a larger dataset can significantly\nimprove prediction accuracy when using artificial neural networks and random\nforests. Additionally, the study highlights the significant impact of dataset\nsize and range on ranking the importance of influence factors. Furthermore, the\nstudy reveals that the extrapolation data range significantly affects the\nextrapolation accuracy of artificial neural networks. Based on the results,\nmassive dataset collection and analysis of dataset feature effects are\nimportant steps in an effective and consistent machine learning process flow\nfor solar-thermal desalination, which can promote machine learning as a more\ngeneral tool in the field of solar-thermal desalination.\n","authors":["Guilong Peng","Senshan Sun","Yangjun Qin","Zhenwei Xu","Juxin Du","Swellam W. sharshir","A. W. Kandel","A. E. Kabeel","Nuo Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07515v2","updated":"2023-07-24T08:10:52Z","published":"2023-04-15T09:39:52Z","title":"S3M: Scalable Statistical Shape Modeling through Unsupervised\n Correspondences","summary":" Statistical shape models (SSMs) are an established way to represent the\nanatomy of a population with various clinically relevant applications. However,\nthey typically require domain expertise, and labor-intensive landmark\nannotations to construct. 
We address these shortcomings by proposing an\nunsupervised method that leverages deep geometric features and functional\ncorrespondences to simultaneously learn local and global shape structures\nacross population anatomies. Our pipeline significantly improves unsupervised\ncorrespondence estimation for SSMs compared to baseline methods, even on highly\nirregular surface topologies. We demonstrate this for two different anatomical\nstructures: the thyroid and a multi-chamber heart dataset. Furthermore, our\nmethod is robust enough to learn from noisy neural network predictions,\npotentially enabling scaling SSMs to larger patient populations without manual\nsegmentation annotation.\n","authors":["Lennart Bastian","Alexander Baumann","Emily Hoppe","Vincent Bürgin","Ha Young Kim","Mahdi Saleh","Benjamin Busam","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2304.07515v2.pdf","comment":"Accepted at MICCAI 2023. 13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.12586v1","updated":"2023-07-24T07:58:18Z","published":"2023-07-24T07:58:18Z","title":"InVAErt networks: a data-driven framework for emulation, inference and\n identifiability analysis","summary":" Use of generative models and deep learning for physics-based systems is\ncurrently dominated by the task of emulation. However, the remarkable\nflexibility offered by data-driven architectures would suggest to extend this\nrepresentation to other aspects of system synthesis including model inversion\nand identifiability. We introduce inVAErt (pronounced \\emph{invert}) networks,\na comprehensive framework for data-driven analysis and synthesis of parametric\nphysical systems which uses a deterministic encoder and decoder to represent\nthe forward and inverse solution maps, normalizing flow to capture the\nprobabilistic distribution of system outputs, and a variational encoder\ndesigned to learn a compact latent representation for the lack of bijectivity\nbetween inputs and outputs. We formally investigate the selection of penalty\ncoefficients in the loss function and strategies for latent space sampling,\nsince we find that these significantly affect both training and testing\nperformance. We validate our framework through extensive numerical examples,\nincluding simple linear, nonlinear, and periodic maps, dynamical systems, and\nspatio-temporal PDEs.\n","authors":["Guoxiang Grayson Tong","Carlos A. Sing Long","Daniele E. Schiavazzi"],"pdf_url":"https://arxiv.org/pdf/2307.12586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09087v3","updated":"2023-07-24T07:55:19Z","published":"2023-06-15T12:33:39Z","title":"Deep learning based Meta-modeling for Multi-objective Technology\n Optimization of Electrical Machines","summary":" Optimization of rotating electrical machines is both time- and\ncomputationally expensive. Because of the different parametrization, design\noptimization is commonly executed separately for each machine technology. In\nthis paper, we present the application of a variational auto-encoder (VAE) to\noptimize two different machine technologies simultaneously, namely an\nasynchronous machine and a permanent magnet synchronous machine. After\ntraining, we employ a deep neural network and a decoder as meta-models to\npredict global key performance indicators (KPIs) and generate associated new\ndesigns, respectively, through unified latent space in the optimization loop.\nNumerical results demonstrate concurrent parametric multi-objective technology\noptimization in the high-dimensional design space. 
The VAE-based approach is\nquantitatively compared to a classical deep learning-based direct approach for\nKPIs prediction.\n","authors":["Vivek Parekh","Dominik Flore","Sebastian Schöps"],"pdf_url":"https://arxiv.org/pdf/2306.09087v3.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2307.12576v1","updated":"2023-07-24T07:47:21Z","published":"2023-07-24T07:47:21Z","title":"Self-refining of Pseudo Labels for Music Source Separation with Noisy\n Labeled Data","summary":" Music source separation (MSS) faces challenges due to the limited\navailability of correctly-labeled individual instrument tracks. With the push\nto acquire larger datasets to improve MSS performance, the inevitability of\nencountering mislabeled individual instrument tracks becomes a significant\nchallenge to address. This paper introduces an automated technique for refining\nthe labels in a partially mislabeled dataset. Our proposed self-refining\ntechnique, employed with a noisy-labeled dataset, results in only a 1% accuracy\ndegradation in multi-label instrument recognition compared to a classifier\ntrained on a clean-labeled dataset. The study demonstrates the importance of\nrefining noisy-labeled data in MSS model training and shows that utilizing the\nrefined dataset leads to comparable results derived from a clean-labeled\ndataset. Notably, upon only access to a noisy dataset, MSS models trained on a\nself-refined dataset even outperform those trained on a dataset refined with a\nclassifier trained on clean labels.\n","authors":["Junghyun Koo","Yunkee Chae","Chang-Bin Jeon","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12576v1.pdf","comment":"24th International Society for Music Information Retrieval Conference\n (ISMIR 2023)"},{"id":"http://arxiv.org/abs/2306.16264v2","updated":"2023-07-24T07:30:53Z","published":"2023-06-28T14:46:55Z","title":"Deep Unfolded Simulated Bifurcation for Massive MIMO Signal Detection","summary":" Multiple-input multiple-output (MIMO) is a key ingredient of next-generation\nwireless communications. Recently, various MIMO signal detectors based on deep\nlearning techniques and quantum(-inspired) algorithms have been proposed to\nimprove the detection performance compared with conventional detectors. This\npaper focuses on the simulated bifurcation (SB) algorithm, a quantum-inspired\nalgorithm. This paper proposes two techniques to improve its detection\nperformance. The first is modifying the algorithm inspired by the\nLevenberg-Marquardt algorithm to eliminate local minima of maximum likelihood\ndetection. The second is the use of deep unfolding, a deep learning technique\nto train the internal parameters of an iterative algorithm. We propose a\ndeep-unfolded SB by making the update rule of SB differentiable. The numerical\nresults show that these proposed detectors significantly improve the signal\ndetection performance in massive MIMO systems.\n","authors":["Satoshi Takabe"],"pdf_url":"https://arxiv.org/pdf/2306.16264v2.pdf","comment":"5pages, 4 figures; codes are available at\n https://github.com/s-takabe/unfolded_simbif"},{"id":"http://arxiv.org/abs/2307.12564v1","updated":"2023-07-24T07:17:33Z","published":"2023-07-24T07:17:33Z","title":"Towards Generalising Neural Topical Representations","summary":" Topic models have evolved from conventional Bayesian probabilistic models to\nNeural Topic Models (NTMs) over the last two decades. 
Although NTMs have\nachieved promising performance when trained and tested on a specific corpus,\ntheir generalisation ability across corpora is rarely studied. In practice, we\noften expect that an NTM trained on a source corpus can still produce quality\ntopical representation for documents in a different target corpus without\nretraining. In this work, we aim to improve NTMs further so that their benefits\ngeneralise reliably across corpora and tasks. To do so, we propose to model\nsimilar documents by minimising their semantical distance when training NTMs.\nSpecifically, similar documents are created by data augmentation during\ntraining; The semantical distance between documents is measured by the\nHierarchical Topic Transport Distance (HOTT), which computes the Optimal\nTransport (OT) distance between the topical representations. Our framework can\nbe readily applied to most NTMs as a plug-and-play module. Extensive\nexperiments show that our framework significantly improves the generalisation\nability regarding neural topical representation across corpora.\n","authors":["Xiaohao Yang","He Zhao","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2307.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09251v2","updated":"2023-07-24T07:08:59Z","published":"2022-11-16T22:50:40Z","title":"Learning-Augmented B-Trees","summary":" We study learning-augmented binary search trees (BSTs) and B-Trees via Treaps\nwith composite priorities. The result is a simple search tree where the depth\nof each item is determined by its predicted weight $w_x$. To achieve the\nresult, each item $x$ has its composite priority\n$-\\lfloor\\log\\log(1/w_x)\\rfloor + U(0, 1)$ where $U(0, 1)$ is the uniform\nrandom variable. This generalizes the recent learning-augmented BSTs\n[Lin-Luo-Woodruff ICML`22], which only work for Zipfian distributions, to\narbitrary inputs and predictions. It also gives the first B-Tree data structure\nthat can provably take advantage of localities in the access sequence via\nonline self-reorganization. The data structure is robust to prediction errors\nand handles insertions, deletions, as well as prediction updates.\n","authors":["Xinyuan Cao","Jingbang Chen","Li Chen","Chris Lambert","Richard Peng","Daniel Sleator"],"pdf_url":"https://arxiv.org/pdf/2211.09251v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2307.10617v3","updated":"2023-07-24T07:03:01Z","published":"2023-07-20T06:35:43Z","title":"Unmasking Falsehoods in Reviews: An Exploration of NLP Techniques","summary":" In the contemporary digital landscape, online reviews have become an\nindispensable tool for promoting products and services across various\nbusinesses. Marketers, advertisers, and online businesses have found incentives\nto create deceptive positive reviews for their products and negative reviews\nfor their competitors' offerings. As a result, the writing of deceptive reviews\nhas become an unavoidable practice for businesses seeking to promote themselves\nor undermine their rivals. Detecting such deceptive reviews has become an\nintense and ongoing area of research. This research paper proposes a machine\nlearning model to identify deceptive reviews, with a particular focus on\nrestaurants. This study delves into the performance of numerous experiments\nconducted on a dataset of restaurant reviews known as the Deceptive Opinion\nSpam Corpus. 
To accomplish this, an n-gram model and max features are developed\nto effectively identify deceptive content, particularly focusing on fake\nreviews. A benchmark study is undertaken to explore the performance of two\ndifferent feature extraction techniques, which are then coupled with five\ndistinct machine learning classification algorithms. The experimental results\nreveal that the passive aggressive classifier stands out among the various\nalgorithms, showcasing the highest accuracy not only in text classification but\nalso in identifying fake reviews. Moreover, the research delves into data\naugmentation and implements various deep learning techniques to further enhance\nthe process of detecting deceptive reviews. The findings shed light on the\nefficacy of the proposed machine learning approach and offer valuable insights\ninto dealing with deceptive reviews in the realm of online businesses.\n","authors":["Anusuya Baby Hari Krishnan"],"pdf_url":"https://arxiv.org/pdf/2307.10617v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2307.12555v1","updated":"2023-07-24T06:41:59Z","published":"2023-07-24T06:41:59Z","title":"Homophily-Driven Sanitation View for Robust Graph Contrastive Learning","summary":" We investigate adversarial robustness of unsupervised Graph Contrastive\nLearning (GCL) against structural attacks. First, we provide a comprehensive\nempirical and theoretical analysis of existing attacks, revealing how and why\nthey downgrade the performance of GCL. Inspired by our analytic results, we\npresent a robust GCL framework that integrates a homophily-driven sanitation\nview, which can be learned jointly with contrastive learning. A key challenge\nthis poses, however, is the non-differentiable nature of the sanitation\nobjective. To address this challenge, we propose a series of techniques to\nenable gradient-based end-to-end robust GCL. Moreover, we develop a fully\nunsupervised hyperparameter tuning method which, unlike prior approaches, does\nnot require knowledge of node labels. We conduct extensive experiments to\nevaluate the performance of our proposed model, GCHS (Graph Contrastive\nLearning with Homophily-driven Sanitation View), against two state of the art\nstructural attacks on GCL. Our results demonstrate that GCHS consistently\noutperforms all state of the art baselines in terms of the quality of generated\nnode embeddings as well as performance on two important downstream tasks.\n","authors":["Yulin Zhu","Xing Ai","Yevgeniy Vorobeychik","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.12555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12551v1","updated":"2023-07-24T06:38:10Z","published":"2023-07-24T06:38:10Z","title":"Continuation Path Learning for Homotopy Optimization","summary":" Homotopy optimization is a traditional method to deal with a complicated\noptimization problem by solving a sequence of easy-to-hard surrogate\nsubproblems. However, this method can be very sensitive to the continuation\nschedule design and might lead to a suboptimal solution to the original\nproblem. In addition, the intermediate solutions, often ignored by classic\nhomotopy optimization, could be useful for many real-world applications. In\nthis work, we propose a novel model-based approach to learn the whole\ncontinuation path for homotopy optimization, which contains infinite\nintermediate solutions for any surrogate subproblems. 
Rather than the classic\nunidirectional easy-to-hard optimization, our method can simultaneously\noptimize the original problem and all surrogate subproblems in a collaborative\nmanner. The proposed model also supports real-time generation of any\nintermediate solution, which could be desirable for many applications.\nExperimental studies on different problems show that our proposed method can\nsignificantly improve the performance of homotopy optimization and provide\nextra helpful information to support better decision-making.\n","authors":["Xi Lin","Zhiyuan Yang","Xiaoyuan Zhang","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12551v1.pdf","comment":"Accepted by the 40th International Conference on Machine Learning\n (ICML 2023)"},{"id":"http://arxiv.org/abs/2304.12438v2","updated":"2023-07-24T06:19:17Z","published":"2023-04-24T20:24:07Z","title":"Stochastic MPC for energy hubs using data driven demand forecasting","summary":" Energy hubs convert and distribute energy resources by combining different\nenergy inputs through multiple conversion and storage components. The optimal\noperation of the energy hub exploits its flexibility to increase the energy\nefficiency and reduce the operational costs. However, uncertainties in the\ndemand present challenges to energy hub optimization. In this paper, we propose\na stochastic MPC controller to minimize energy costs using chance constraints\nfor the uncertain electricity and thermal demands. Historical data is used to\nbuild a demand prediction model based on Gaussian processes to generate a\nforecast of the future electricity and heat demands. The stochastic\noptimization problem is solved via the Scenario Approach by sampling multi-step\ndemand trajectories from the derived prediction model. The performance of the\nproposed predictor and of the stochastic controller is verified on a simulated\nenergy hub model and demand data from a real building.\n","authors":["Varsha Behrunani","Francesco Micheli","Jonas Mehr","Philipp Heer","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2304.12438v2.pdf","comment":"6 pages, 5 figures. Submitted to IFAC World Congress 2023"},{"id":"http://arxiv.org/abs/2211.09710v3","updated":"2023-07-24T05:39:27Z","published":"2022-11-17T17:45:59Z","title":"Style Classification of Rabbinic Literature for Detection of Lost\n Midrash Tanhuma Material","summary":" Midrash collections are complex rabbinic works that consist of text in\nmultiple languages, which evolved through long processes of unstable oral and\nwritten transmission. Determining the origin of a given passage in such a\ncompilation is not always straightforward and is often a matter of dispute\namong scholars, yet it is essential for scholars' understanding of the passage\nand its relationship to other texts in the rabbinic corpus. To help solve this\nproblem, we propose a system for classification of rabbinic literature based on\nits style, leveraging recent advances in natural language processing for Hebrew\ntexts. 
Additionally, we demonstrate how this method can be applied to uncover\nlost material from a specific midrash genre, Tan\\d{h}uma-Yelammedenu, that has\nbeen preserved in later anthologies.\n","authors":["Shlomo Tannor","Nachum Dershowitz","Moshe Lavee"],"pdf_url":"https://arxiv.org/pdf/2211.09710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12532v1","updated":"2023-07-24T05:36:19Z","published":"2023-07-24T05:36:19Z","title":"On the Connection between Pre-training Data Diversity and Fine-tuning\n Robustness","summary":" Pre-training has been widely adopted in deep learning to improve model\nperformance, especially when the training data for a target task is limited. In\nour work, we seek to understand the implications of this training strategy on\nthe generalization properties of downstream models. More specifically, we ask\nthe following question: how do properties of the pre-training distribution\naffect the robustness of a fine-tuned model? The properties we explore include\nthe label space, label semantics, image diversity, data domains, and data\nquantity of the pre-training distribution. We find that the primary factor\ninfluencing downstream effective robustness (Taori et al., 2020) is data\nquantity, while other factors have limited significance. For example, reducing\nthe number of ImageNet pre-training classes by 4x while increasing the number\nof images per class by 4x (that is, keeping total data quantity fixed) does not\nimpact the robustness of fine-tuned models. We demonstrate our findings on\npre-training distributions drawn from various natural and synthetic data\nsources, primarily using the iWildCam-WILDS distribution shift as a test for\ndownstream robustness.\n","authors":["Vivek Ramanujan","Thao Nguyen","Sewoong Oh","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2307.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12526v1","updated":"2023-07-24T04:56:23Z","published":"2023-07-24T04:56:23Z","title":"Rethinking Medical Report Generation: Disease Revealing Enhancement with\n Knowledge Graph","summary":" Knowledge Graph (KG) plays a crucial role in Medical Report Generation (MRG)\nbecause it reveals the relations among diseases and thus can be utilized to\nguide the generation process. However, constructing a comprehensive KG is\nlabor-intensive and its applications on the MRG process are under-explored. In\nthis study, we establish a complete KG on chest X-ray imaging that includes 137\ntypes of diseases and abnormalities. Based on this KG, we find that the current\nMRG data sets exhibit a long-tailed problem in disease distribution. To\nmitigate this problem, we introduce a novel augmentation strategy that enhances\nthe representation of disease types in the tail-end of the distribution. We\nfurther design a two-stage MRG approach, where a classifier is first trained to\ndetect whether the input images exhibit any abnormalities. The classified\nimages are then independently fed into two transformer-based generators,\nnamely, ``disease-specific generator\" and ``disease-free generator\" to generate\nthe corresponding reports. To enhance the clinical evaluation of whether the\ngenerated reports correctly describe the diseases appearing in the input image,\nwe propose diverse sensitivity (DS), a new metric that checks whether generated\ndiseases match ground truth and measures the diversity of all generated\ndiseases. 
Results show that the proposed two-stage generation framework and\naugmentation strategies improve DS by a considerable margin, indicating a\nnotable reduction in the long-tailed problem associated with under-represented\ndiseases.\n","authors":["Yixin Wang","Zihao Lin","Haoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2307.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12524v1","updated":"2023-07-24T04:46:22Z","published":"2023-07-24T04:46:22Z","title":"Landslide Surface Displacement Prediction Based on VSXC-LSTM Algorithm","summary":" Landslide is a natural disaster that can easily threaten local ecology,\npeople's lives and property. In this paper, we conduct modelling research on\nreal unidirectional surface displacement data of recent landslides in the\nresearch area and propose a time series prediction framework named\nVMD-SegSigmoid-XGBoost-ClusterLSTM (VSXC-LSTM) based on variational mode\ndecomposition, which can predict the landslide surface displacement more\naccurately. The model performs well on the test set. Except for the random item\nsubsequence that is hard to fit, the root mean square error (RMSE) and the mean\nabsolute percentage error (MAPE) of the trend item subsequence and the periodic\nitem subsequence are both less than 0.1, and the RMSE is as low as 0.006 for\nthe periodic item prediction module based on XGBoost\\footnote{Accepted in\nICANN2023}.\n","authors":["Menglin Kong","Ruichen Li","Fan Liu","Xingquan Li","Juan Cheng","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12520v1","updated":"2023-07-24T04:29:43Z","published":"2023-07-24T04:29:43Z","title":"Lost In Translation: Generating Adversarial Examples Robust to\n Round-Trip Translation","summary":" Language Models today provide a high accuracy across a large number of\ndownstream tasks. However, they remain susceptible to adversarial attacks,\nparticularly against those where the adversarial examples maintain considerable\nsimilarity to the original text. Given the multilingual nature of text, the\neffectiveness of adversarial examples across translations and how machine\ntranslations can improve the robustness of adversarial examples remain largely\nunexplored. In this paper, we present a comprehensive study on the robustness\nof current text adversarial attacks to round-trip translation. We demonstrate\nthat 6 state-of-the-art text-based adversarial attacks do not maintain their\nefficacy after round-trip translation. Furthermore, we introduce an\nintervention-based solution to this problem, by integrating Machine Translation\ninto the process of adversarial example generation and demonstrating increased\nrobustness to round-trip translation. 
Our results indicate that finding\nadversarial examples robust to translation can help identify the insufficiency\nof language models that is common across languages, and motivate further\nresearch into multilingual adversarial attacks.\n","authors":["Neel Bhandari","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12520v1.pdf","comment":"Published at International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2023"},{"id":"http://arxiv.org/abs/2307.12519v1","updated":"2023-07-24T04:29:00Z","published":"2023-07-24T04:29:00Z","title":"DEPHN: Different Expression Parallel Heterogeneous Network using virtual\n gradient optimization for Multi-task Learning","summary":" Recommendation system algorithm based on multi-task learning (MTL) is the\nmajor method for Internet operators to understand users and predict their\nbehaviors in the multi-behavior scenario of platform. Task correlation is an\nimportant consideration of MTL goals, traditional models use shared-bottom\nmodels and gating experts to realize shared representation learning and\ninformation differentiation. However, the relationships between real-world tasks\nare often more complex than existing methods can properly handle when sharing\ninformation. In this paper, we propose a Different Expression Parallel\nHeterogeneous Network (DEPHN) to model multiple tasks simultaneously. DEPHN\nconstructs the experts at the bottom of the model by using different feature\ninteraction methods to improve the generalization ability of the shared\ninformation flow. In view of the model's differentiating ability for different\ntask information flows, DEPHN uses feature explicit mapping and virtual\ngradient coefficient for expert gating during the training process, and\nadaptively adjusts the learning intensity of the gated unit by considering the\ndifference of gating values and task correlation. Extensive experiments on\nartificial and real-world datasets demonstrate that our proposed method can\ncapture task correlation in complex situations and achieve better performance\nthan baseline models\\footnote{Accepted in IJCNN2023}.\n","authors":["Menglin Kong","Ri Su","Shaojie Zhao","Muzhou Hou"],"pdf_url":"https://arxiv.org/pdf/2307.12519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12518v1","updated":"2023-07-24T04:23:08Z","published":"2023-07-24T04:23:08Z","title":"FaFCNN: A General Disease Classification Framework Based on Feature\n Fusion Neural Networks","summary":" There are two fundamental problems in applying deep learning/machine learning\nmethods to disease classification tasks, one is the insufficient number and\npoor quality of training samples; another one is how to effectively fuse\nmultiple source features and thus train robust classification models. To\naddress these problems, inspired by the process of human learning knowledge, we\npropose the Feature-aware Fusion Correlation Neural Network (FaFCNN), which\nintroduces a feature-aware interaction module and a feature alignment module\nbased on domain adversarial learning. This is a general framework for disease\nclassification, and FaFCNN improves the way existing methods obtain sample\ncorrelation features. The experimental results show that training using\naugmented features obtained by pre-training gradient boosting decision tree\nyields more performance gains than random-forest based methods. 
On the\nlow-quality dataset with a large amount of missing data in our setup, FaFCNN\nobtains a consistently optimal performance compared to competitive baselines.\nIn addition, extensive experiments demonstrate the robustness of the proposed\nmethod and the effectiveness of each component of the model\\footnote{Accepted\nin IEEE SMC2023}.\n","authors":["Menglin Kong","Shaojie Zhao","Juan Cheng","Xingquan Li","Ri Su","Muzhou Hou","Cong Cao"],"pdf_url":"https://arxiv.org/pdf/2307.12518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12510v1","updated":"2023-07-24T03:52:11Z","published":"2023-07-24T03:52:11Z","title":"An Empirical Evaluation of Temporal Graph Benchmark","summary":" In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark\n(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with\nTGB, we include eleven popular dynamic graph learning methods for more\nexhaustive comparisons. Through the experiments, we find that (1) some issues\nneed to be addressed in the current version of TGB, including mismatched data\nstatistics, inaccurate evaluation metric computation, and so on; (2) different\nmodels depict varying performance across various datasets, which is in line\nwith previous observations; (3) the performance of some baselines can be\nsignificantly improved over the reported results in TGB when using DyGLib. This\nwork aims to ease the researchers' efforts in evaluating various dynamic graph\nlearning methods on TGB and attempts to offer results that can be directly\nreferenced in the follow-up research. All the used resources in this project\nare publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is\nin progress, and feedback from the community is welcomed for improvements.\n","authors":["Le Yu"],"pdf_url":"https://arxiv.org/pdf/2307.12510v1.pdf","comment":"preprint, in progress"},{"id":"http://arxiv.org/abs/2304.03483v2","updated":"2023-07-24T03:28:34Z","published":"2023-04-07T05:29:59Z","title":"RED-PSM: Regularization by Denoising of Partially Separable Models for\n Dynamic Imaging","summary":" Dynamic imaging addresses the recovery of a time-varying 2D or 3D object at\neach time instant using its undersampled measurements. In particular, in the\ncase of dynamic tomography, only a single projection at a single view angle may\nbe available at a time, making the problem severely ill-posed. In this work, we\npropose an approach, RED-PSM, which combines for the first time two powerful\ntechniques to address this challenging imaging problem. The first, are\npartially separable models, which have been used to efficiently introduce a\nlow-rank prior for the spatio-temporal object. The second is the recent\nRegularization by Denoising (RED), which provides a flexible framework to\nexploit the impressive performance of state-of-the-art image denoising\nalgorithms, for various inverse problems. We propose a partially separable\nobjective with RED and a computationally efficient and scalable optimization\nscheme with variable splitting and ADMM. Theoretical analysis proves the\nconvergence of our objective to a value corresponding to a stationary point\nsatisfying the first-order optimality conditions. Convergence is accelerated by\na particular projection-domain-based initialization. We demonstrate the\nperformance and computational improvements of our proposed RED-PSM with a\nlearned image denoiser by comparing it to a recent deep-prior-based method\nknown as TD-DIP. 
Although the main focus is on dynamic tomography, we also show\nthe performance advantages of RED-PSM in a cardiac dynamic MRI setting.\n","authors":["Berk Iskender","Marc L. Klasky","Yoram Bresler"],"pdf_url":"https://arxiv.org/pdf/2304.03483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12499v1","updated":"2023-07-24T03:10:02Z","published":"2023-07-24T03:10:02Z","title":"AdvDiff: Generating Unrestricted Adversarial Examples using Diffusion\n Models","summary":" Unrestricted adversarial attacks present a serious threat to deep learning\nmodels and adversarial defense techniques. They pose severe security problems\nfor deep learning applications because they can effectively bypass defense\nmechanisms. However, previous attack methods often utilize Generative\nAdversarial Networks (GANs), which are not theoretically provable and thus\ngenerate unrealistic examples by incorporating adversarial objectives,\nespecially for large-scale datasets like ImageNet. In this paper, we propose a\nnew method, called AdvDiff, to generate unrestricted adversarial examples with\ndiffusion models. We design two novel adversarial guidance techniques to\nconduct adversarial sampling in the reverse generation process of diffusion\nmodels. These two techniques are effective and stable to generate high-quality,\nrealistic adversarial examples by integrating gradients of the target\nclassifier interpretably. Experimental results on MNIST and ImageNet datasets\ndemonstrate that AdvDiff is effective to generate unrestricted adversarial\nexamples, which outperforms GAN-based methods in terms of attack performance\nand generation quality.\n","authors":["Xuelong Dai","Kaisheng Liang","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.12499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12496v1","updated":"2023-07-24T03:04:10Z","published":"2023-07-24T03:04:10Z","title":"A faster and simpler algorithm for learning shallow networks","summary":" We revisit the well-studied problem of learning a linear combination of $k$\nReLU activations given labeled examples drawn from the standard $d$-dimensional\nGaussian measure. Chen et al. [CDG+23] recently gave the first algorithm for\nthis problem to run in $\\text{poly}(d,1/\\varepsilon)$ time when $k = O(1)$,\nwhere $\\varepsilon$ is the target error. More precisely, their algorithm runs\nin time $(d/\\varepsilon)^{\\mathrm{quasipoly}(k)}$ and learns over multiple\nstages. Here we show that a much simpler one-stage version of their algorithm\nsuffices, and moreover its runtime is only $(d/\\varepsilon)^{O(k^2)}$.\n","authors":["Sitan Chen","Shyam Narayanan"],"pdf_url":"https://arxiv.org/pdf/2307.12496v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2307.12491v1","updated":"2023-07-24T02:50:19Z","published":"2023-07-24T02:50:19Z","title":"Learning Universal and Robust 3D Molecular Representations with Graph\n Convolutional Networks","summary":" To learn accurate representations of molecules, it is essential to consider\nboth chemical and geometric features. To encode geometric information, many\ndescriptors have been proposed in constrained circumstances for specific types\nof molecules and do not have the properties to be ``robust\": 1. Invariant to\nrotations and translations; 2. Injective when embedding molecular structures.\nIn this work, we propose a universal and robust Directional Node Pair (DNP)\ndescriptor based on the graph representations of 3D molecules. 
Our DNP\ndescriptor is robust compared to previous ones and can be applied to multiple\nmolecular types. To combine the DNP descriptor and chemical features in\nmolecules, we construct the Robust Molecular Graph Convolutional Network\n(RoM-GCN) which is capable to take both node and edge features into\nconsideration when generating molecule representations. We evaluate our model\non protein and small molecule datasets. Our results validate the superiority of\nthe DNP descriptor in incorporating 3D geometric information of molecules.\nRoM-GCN outperforms all compared baselines.\n","authors":["Shuo Zhang","Yang Liu","Li Xie","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2307.12491v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2307.01482v2","updated":"2023-07-24T02:40:29Z","published":"2023-07-04T05:19:19Z","title":"Nexus sine qua non: Essentially Connected Networks for Traffic\n Forecasting","summary":" Spatial-temporal graph neural networks (STGNNs) have become the de facto\nmodels for learning spatiotemporal representations of traffic flow. However,\nmodern STGNNs often contain superfluous or obscure components, along with\ncomplex techniques, posing significant challenges in terms of complexity and\nscalability. Such concerns prompt us to rethink the design of neural\narchitectures and to identify the key challenges in traffic forecasting as\nspatial-temporal contextualization. Here, we present an essentially connected\nmodel based on an efficient message-passing backbone, powered by learnable node\nembedding, without any complex sequential techniques such as TCNs, RNNs, and\nTransformers. Intriguingly, empirical results demonstrate how a simple and\nelegant model with contextualization capability compares favorably w.r.t. the\nstate-of-the-art with elaborate structures, while being much more interpretable\nand computationally efficient for traffic forecasting. We anticipate that our\nfindings will open new horizons for further research to explore the possibility\nof creating simple but effective neural forecasting architectures.\n","authors":["Tong Nie","Guoyang Qin","Yunpeng Wang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2307.01482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04893v2","updated":"2023-07-24T02:38:09Z","published":"2023-07-10T20:31:23Z","title":"Choosing Well Your Opponents: How to Guide the Synthesis of Programmatic\n Strategies","summary":" This paper introduces Local Learner (2L), an algorithm for providing a set of\nreference strategies to guide the search for programmatic strategies in\ntwo-player zero-sum games. Previous learning algorithms, such as Iterated Best\nResponse (IBR), Fictitious Play (FP), and Double-Oracle (DO), can be\ncomputationally expensive or miss important information for guiding search\nalgorithms. 2L actively selects a set of reference strategies to improve the\nsearch signal. We empirically demonstrate the advantages of our approach while\nguiding a local search algorithm for synthesizing strategies in three games,\nincluding MicroRTS, a challenging real-time strategy game. Results show that 2L\nlearns reference strategies that provide a stronger search signal than IBR, FP,\nand DO. We also simulate a tournament of MicroRTS, where a synthesizer using 2L\noutperformed the winners of the two latest MicroRTS competitions, which were\nprogrammatic strategies written by human programmers.\n","authors":["Rubens O. Moraes","David S. Aleixo","Lucas N. Ferreira","Levi H. S. 
Lelis"],"pdf_url":"https://arxiv.org/pdf/2307.04893v2.pdf","comment":"International Joint Conference on Artificial Intelligence (IJCAI)\n 2023"},{"id":"http://arxiv.org/abs/2307.12480v1","updated":"2023-07-24T02:28:50Z","published":"2023-07-24T02:28:50Z","title":"Learning Resource Allocation Policy: Vertex-GNN or Edge-GNN?","summary":" Graph neural networks (GNNs) update the hidden representations of vertices\n(called Vertex-GNNs) or hidden representations of edges (called Edge-GNNs) by\nprocessing and pooling the information of neighboring vertices and edges and\ncombining to incorporate graph topology. When learning resource allocation\npolicies, GNNs cannot perform well if their expressive power are weak, i.e., if\nthey cannot differentiate all input features such as channel matrices. In this\npaper, we analyze the expressive power of the Vertex-GNNs and Edge-GNNs for\nlearning three representative wireless policies: link scheduling, power\ncontrol, and precoding policies. We find that the expressive power of the GNNs\ndepend on the linearity and output dimensions of the processing and combination\nfunctions. When linear processors are used, the Vertex-GNNs cannot\ndifferentiate all channel matrices due to the loss of channel information,\nwhile the Edge-GNNs can. When learning the precoding policy, even the\nVertex-GNNs with non-linear processors may not be with strong expressive\nability due to the dimension compression. We proceed to provide necessary\nconditions for the GNNs to well learn the precoding policy. Simulation results\nvalidate the analyses and show that the Edge-GNNs can achieve the same\nperformance as the Vertex-GNNs with much lower training and inference time.\n","authors":["Yao Peng","Jia Guo","Chenyang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16392v2","updated":"2023-07-24T02:05:50Z","published":"2022-10-28T20:13:00Z","title":"Physics-aware Graph Neural Network for Accurate RNA 3D Structure\n Prediction","summary":" Biological functions of RNAs are determined by their three-dimensional (3D)\nstructures. Thus, given the limited number of experimentally determined RNA\nstructures, the prediction of RNA structures will facilitate elucidating RNA\nfunctions and RNA-targeted drug discovery, but remains a challenging task. In\nthis work, we propose a Graph Neural Network (GNN)-based scoring function\ntrained only with the atomic types and coordinates on limited solved RNA 3D\nstructures for distinguishing accurate structural models. The proposed\nPhysics-aware Multiplex Graph Neural Network (PaxNet) separately models the\nlocal and non-local interactions inspired by molecular mechanics. Furthermore,\nPaxNet contains an attention-based fusion module that learns the individual\ncontribution of each interaction type for the final prediction. We rigorously\nevaluate the performance of PaxNet on two benchmarks and compare it with\nseveral state-of-the-art baselines. The results show that PaxNet significantly\noutperforms all the baselines overall, and demonstrate the potential of PaxNet\nfor improving the 3D structure modeling of RNA and other macromolecules. 
Our\ncode is available at https://github.com/zetayue/Physics-aware-Multiplex-GNN.\n","authors":["Shuo Zhang","Yang Liu","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2210.16392v2.pdf","comment":"Accepted by the Machine Learning for Structural Biology Workshop\n (MLSB) at the 36th Conference on Neural Information Processing Systems\n (NeurIPS 2022)"},{"id":"http://arxiv.org/abs/2307.12472v1","updated":"2023-07-24T01:58:48Z","published":"2023-07-24T01:58:48Z","title":"Model-free generalized fiducial inference","summary":" Motivated by the need for the development of safe and reliable methods for\nuncertainty quantification in machine learning, I propose and develop ideas for\na model-free statistical framework for imprecise probabilistic prediction\ninference. This framework facilitates uncertainty quantification in the form of\nprediction sets that offer finite-sample control of type 1 errors, a property\nshared with conformal prediction sets, but this new approach also offers more\nversatile tools for imprecise probabilistic reasoning. Furthermore, I propose\nand consider the theoretical and empirical properties of a precise\nprobabilistic approximation to the model-free imprecise framework.\nApproximating a belief/plausibility measure pair by an [optimal in some sense]\nprobability measure in the credal set is a critical resolution needed for the\nbroader adoption of imprecise probabilistic approaches to inference in the\nstatistical and machine learning communities. More generally, it remains largely\nundetermined in the statistical and machine learning literatures how to properly\nquantify uncertainty, in that there is no generally accepted standard of\naccountability for stated uncertainties. The research I present in this\nmanuscript is aimed at motivating a framework for statistical inference with\nreliability and accountability as the guiding principles.\n","authors":["Jonathan P Williams"],"pdf_url":"https://arxiv.org/pdf/2307.12472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12463v1","updated":"2023-07-24T00:53:46Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident outputs\nand require correction by calibration methods. Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks.
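For context on the calibration abstract above: the standard temperature-scaling baseline it mentions fits a single scalar temperature on held-out logits. The sketch below is a minimal NumPy version of that baseline (using grid search over T rather than gradient descent, with toy data); it is not the paper's proposed Masked Temperature Scaling.

```python
# Minimal sketch of standard temperature scaling (the baseline the abstract
# refers to), NOT the paper's Masked Temperature Scaling. A single scalar T is
# fit on held-out logits by grid search over the validation negative log-likelihood.
import numpy as np

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def nll(logits, labels, T):
    """Negative log-likelihood of the labels under temperature-scaled probabilities."""
    probs = softmax(logits / T)
    return -np.mean(np.log(probs[np.arange(len(labels)), labels] + 1e-12))

def fit_temperature(val_logits, val_labels, grid=np.linspace(0.5, 5.0, 91)):
    """Return the temperature that minimizes the validation NLL."""
    return min(grid, key=lambda T: nll(val_logits, val_labels, T))

# Toy usage: a classifier that is ~70% accurate but nearly 100% confident
# should be softened, i.e. the fitted temperature should come out above 1.
rng = np.random.default_rng(0)
labels = rng.integers(0, 10, size=512)
peak = np.where(rng.random(512) < 0.7, labels, rng.integers(0, 10, size=512))
logits = 8.0 * np.eye(10)[peak] + rng.normal(size=(512, 10))
print("fitted temperature:", fit_temperature(logits, labels))
```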
To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT) which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12461v1","updated":"2023-07-24T00:16:50Z","published":"2023-07-24T00:16:50Z","title":"Rates of Approximation by ReLU Shallow Neural Networks","summary":" Neural networks activated by the rectified linear unit (ReLU) play a central\nrole in the recent development of deep learning. The topic of approximating\nfunctions from H\\\"older spaces by these networks is crucial for understanding\nthe efficiency of the induced learning algorithms. Although the topic has been\nwell investigated in the setting of deep neural networks with many layers of\nhidden neurons, it is still open for shallow networks having only one hidden\nlayer. In this paper, we provide rates of uniform approximation by these\nnetworks. We show that ReLU shallow neural networks with $m$ hidden neurons can\nuniformly approximate functions from the H\\\"older space $W_\\infty^r([-1, 1]^d)$\nwith rates $O((\\log m)^{\\frac{1}{2} +d}m^{-\\frac{r}{d}\\frac{d+2}{d+4}})$ when\n$r